summaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/brd.c14
-rw-r--r--drivers/block/cciss.c3
-rw-r--r--drivers/block/drbd/drbd_actlog.c61
-rw-r--r--drivers/block/drbd/drbd_bitmap.c92
-rw-r--r--drivers/block/drbd/drbd_debugfs.c17
-rw-r--r--drivers/block/drbd/drbd_int.h57
-rw-r--r--drivers/block/drbd/drbd_interval.h14
-rw-r--r--drivers/block/drbd/drbd_main.c135
-rw-r--r--drivers/block/drbd/drbd_nl.c282
-rw-r--r--drivers/block/drbd/drbd_proc.c30
-rw-r--r--drivers/block/drbd/drbd_protocol.h79
-rw-r--r--drivers/block/drbd/drbd_receiver.c569
-rw-r--r--drivers/block/drbd/drbd_req.c120
-rw-r--r--drivers/block/drbd/drbd_req.h5
-rw-r--r--drivers/block/drbd/drbd_state.c61
-rw-r--r--drivers/block/drbd/drbd_state.h2
-rw-r--r--drivers/block/drbd/drbd_strings.c8
-rw-r--r--drivers/block/drbd/drbd_worker.c118
-rw-r--r--drivers/block/floppy.c6
-rw-r--r--drivers/block/loop.c15
-rw-r--r--drivers/block/mg_disk.c9
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c7
-rw-r--r--drivers/block/nbd.c4
-rw-r--r--drivers/block/null_blk.c2
-rw-r--r--drivers/block/osdblk.c2
-rw-r--r--drivers/block/pktcdvd.c4
-rw-r--r--drivers/block/ps3disk.c7
-rw-r--r--drivers/block/ps3vram.c3
-rw-r--r--drivers/block/rbd.c4
-rw-r--r--drivers/block/rsxx/dev.c4
-rw-r--r--drivers/block/rsxx/dma.c2
-rw-r--r--drivers/block/skd_main.c10
-rw-r--r--drivers/block/sunvdc.c3
-rw-r--r--drivers/block/umem.c8
-rw-r--r--drivers/block/virtio_blk.c26
-rw-r--r--drivers/block/xen-blkback/blkback.c27
-rw-r--r--drivers/block/xen-blkback/xenbus.c22
-rw-r--r--drivers/block/xen-blkfront.c117
-rw-r--r--drivers/block/zram/Kconfig15
-rw-r--r--drivers/block/zram/Makefile4
-rw-r--r--drivers/block/zram/zcomp.c150
-rw-r--r--drivers/block/zram/zcomp.h36
-rw-r--r--drivers/block/zram/zcomp_lz4.c56
-rw-r--r--drivers/block/zram/zcomp_lz4.h17
-rw-r--r--drivers/block/zram/zcomp_lzo.c56
-rw-r--r--drivers/block/zram/zcomp_lzo.h17
-rw-r--r--drivers/block/zram/zram_drv.c50
-rw-r--r--drivers/block/zram/zram_drv.h5
48 files changed, 1565 insertions, 790 deletions
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index c04bd9bc39fd..3022dad24071 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -339,7 +339,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
goto io_error;
- if (unlikely(bio->bi_rw & REQ_DISCARD)) {
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
if (sector & ((PAGE_SIZE >> SECTOR_SHIFT) - 1) ||
bio->bi_iter.bi_size & ~PAGE_MASK)
goto io_error;
@@ -347,9 +347,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
goto out;
}
- rw = bio_rw(bio);
- if (rw == READA)
- rw = READ;
+ rw = bio_data_dir(bio);
bio_for_each_segment(bvec, bio, iter) {
unsigned int len = bvec.bv_len;
@@ -381,7 +379,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
#ifdef CONFIG_BLK_DEV_RAM_DAX
static long brd_direct_access(struct block_device *bdev, sector_t sector,
- void __pmem **kaddr, pfn_t *pfn, long size)
+ void **kaddr, pfn_t *pfn, long size)
{
struct brd_device *brd = bdev->bd_disk->private_data;
struct page *page;
@@ -391,7 +389,7 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector,
page = brd_insert_page(brd, sector);
if (!page)
return -ENOSPC;
- *kaddr = (void __pmem *)page_address(page);
+ *kaddr = page_address(page);
*pfn = page_to_pfn_t(page);
return PAGE_SIZE;
@@ -509,7 +507,9 @@ static struct brd_device *brd_alloc(int i)
blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX);
brd->brd_queue->limits.discard_zeroes_data = 1;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
-
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+ queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
+#endif
disk = brd->brd_disk = alloc_disk(max_part);
if (!disk)
goto out_free_queue;
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 63c2064689f8..db9d6bb6352d 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1951,7 +1951,6 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
if (cciss_create_ld_sysfs_entry(h, drv_index))
goto cleanup_queue;
disk->private_data = h->drv[drv_index];
- disk->driverfs_dev = &h->drv[drv_index]->dev;
/* Set up queue information */
blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
@@ -1973,7 +1972,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
/* allows the interrupt handler to start the queue */
wmb();
h->drv[drv_index]->queue = disk->queue;
- add_disk(disk);
+ device_add_disk(&h->drv[drv_index]->dev, disk);
return 0;
cleanup_queue:
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 10459a145062..0a1aaf8c24c4 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -137,19 +137,19 @@ void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_b
static int _drbd_md_sync_page_io(struct drbd_device *device,
struct drbd_backing_dev *bdev,
- sector_t sector, int rw)
+ sector_t sector, int op)
{
struct bio *bio;
/* we do all our meta data IO in aligned 4k blocks. */
const int size = 4096;
- int err;
+ int err, op_flags = 0;
device->md_io.done = 0;
device->md_io.error = -ENODEV;
- if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
- rw |= REQ_FUA | REQ_FLUSH;
- rw |= REQ_SYNC | REQ_NOIDLE;
+ if ((op == REQ_OP_WRITE) && !test_bit(MD_NO_FUA, &device->flags))
+ op_flags |= REQ_FUA | REQ_PREFLUSH;
+ op_flags |= REQ_SYNC | REQ_NOIDLE;
bio = bio_alloc_drbd(GFP_NOIO);
bio->bi_bdev = bdev->md_bdev;
@@ -159,9 +159,9 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
goto out;
bio->bi_private = device;
bio->bi_end_io = drbd_md_endio;
- bio->bi_rw = rw;
+ bio_set_op_attrs(bio, op, op_flags);
- if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL)
+ if (op != REQ_OP_WRITE && device->state.disk == D_DISKLESS && device->ldev == NULL)
/* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
;
else if (!get_ldev_if_state(device, D_ATTACHING)) {
@@ -174,10 +174,10 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
bio_get(bio); /* one bio_put() is in the completion handler */
atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
device->md_io.submit_jif = jiffies;
- if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+ if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
bio_io_error(bio);
else
- submit_bio(rw, bio);
+ submit_bio(bio);
wait_until_done_or_force_detached(device, bdev, &device->md_io.done);
if (!bio->bi_error)
err = device->md_io.error;
@@ -188,7 +188,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device,
}
int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev,
- sector_t sector, int rw)
+ sector_t sector, int op)
{
int err;
D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);
@@ -197,19 +197,21 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd
dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
current->comm, current->pid, __func__,
- (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
+ (unsigned long long)sector, (op == REQ_OP_WRITE) ? "WRITE" : "READ",
(void*)_RET_IP_ );
if (sector < drbd_md_first_sector(bdev) ||
sector + 7 > drbd_md_last_sector(bdev))
drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
current->comm, current->pid, __func__,
- (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+ (unsigned long long)sector,
+ (op == REQ_OP_WRITE) ? "WRITE" : "READ");
- err = _drbd_md_sync_page_io(device, bdev, sector, rw);
+ err = _drbd_md_sync_page_io(device, bdev, sector, op);
if (err) {
drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
- (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
+ (unsigned long long)sector,
+ (op == REQ_OP_WRITE) ? "WRITE" : "READ", err);
}
return err;
}
@@ -256,7 +258,7 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval
unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
- D_ASSERT(device, (unsigned)(last - first) <= 1);
+ D_ASSERT(device, first <= last);
D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
/* FIXME figure out a fast path for bios crossing AL extent boundaries */
@@ -339,6 +341,8 @@ static int __al_write_transaction(struct drbd_device *device, struct al_transact
i = 0;
+ drbd_bm_reset_al_hints(device);
+
/* Even though no one can start to change this list
* once we set the LC_LOCKED -- from drbd_al_begin_io(),
* lc_try_lock_for_transaction() --, someone may still
@@ -768,10 +772,18 @@ static bool lazy_bitmap_update_due(struct drbd_device *device)
static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
{
- if (rs_done)
- set_bit(RS_DONE, &device->flags);
- /* and also set RS_PROGRESS below */
- else if (!lazy_bitmap_update_due(device))
+ if (rs_done) {
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ if (connection->agreed_pro_version <= 95 ||
+ is_sync_target_state(device->state.conn))
+ set_bit(RS_DONE, &device->flags);
+ /* and also set RS_PROGRESS below */
+
+ /* Else: rather wait for explicit notification via receive_state,
+ * to avoid uuids-rotated-too-fast causing full resync
+ * in next handshake, in case the replication link breaks
+ * at the most unfortunate time... */
+ } else if (!lazy_bitmap_update_due(device))
return;
drbd_device_post_work(device, RS_PROGRESS);
@@ -830,6 +842,13 @@ static int update_sync_bits(struct drbd_device *device,
return count;
}
+static bool plausible_request_size(int size)
+{
+ return size > 0
+ && size <= DRBD_MAX_BATCH_BIO_SIZE
+ && IS_ALIGNED(size, 512);
+}
+
/* clear the bit corresponding to the piece of storage in question:
* size byte of data starting from sector. Only clear a bits of the affected
* one ore more _aligned_ BM_BLOCK_SIZE blocks.
@@ -845,11 +864,11 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
unsigned long count = 0;
sector_t esector, nr_sectors;
- /* This would be an empty REQ_FLUSH, be silent. */
+ /* This would be an empty REQ_PREFLUSH, be silent. */
if ((mode == SET_OUT_OF_SYNC) && size == 0)
return 0;
- if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+ if (!plausible_request_size(size)) {
drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
drbd_change_sync_fname[mode],
(unsigned long long)sector, size);
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 92d6fc020a65..ab62b81c2ca7 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -96,6 +96,13 @@ struct drbd_bitmap {
struct page **bm_pages;
spinlock_t bm_lock;
+ /* exclusively to be used by __al_write_transaction(),
+ * drbd_bm_mark_for_writeout() and
+ * and drbd_bm_write_hinted() -> bm_rw() called from there.
+ */
+ unsigned int n_bitmap_hints;
+ unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION];
+
/* see LIMITATIONS: above */
unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
@@ -242,6 +249,11 @@ static void bm_set_page_need_writeout(struct page *page)
set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
}
+void drbd_bm_reset_al_hints(struct drbd_device *device)
+{
+ device->bitmap->n_bitmap_hints = 0;
+}
+
/**
* drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
* @device: DRBD device.
@@ -253,6 +265,7 @@ static void bm_set_page_need_writeout(struct page *page)
*/
void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
{
+ struct drbd_bitmap *b = device->bitmap;
struct page *page;
if (page_nr >= device->bitmap->bm_number_of_pages) {
drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
@@ -260,7 +273,9 @@ void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
return;
}
page = device->bitmap->bm_pages[page_nr];
- set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page));
+ BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints));
+ if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)))
+ b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr;
}
static int bm_test_page_unchanged(struct page *page)
@@ -427,8 +442,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
}
/*
- * called on driver init only. TODO call when a device is created.
- * allocates the drbd_bitmap, and stores it in device->bitmap.
+ * allocates the drbd_bitmap and stores it in device->bitmap.
*/
int drbd_bm_init(struct drbd_device *device)
{
@@ -633,7 +647,8 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
unsigned long bits, words, owords, obits;
unsigned long want, have, onpages; /* number of pages */
struct page **npages, **opages = NULL;
- int err = 0, growing;
+ int err = 0;
+ bool growing;
if (!expect(b))
return -ENOMEM;
@@ -980,7 +995,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
struct drbd_bitmap *b = device->bitmap;
struct page *page;
unsigned int len;
- unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE;
+ unsigned int op = (ctx->flags & BM_AIO_READ) ? REQ_OP_READ : REQ_OP_WRITE;
sector_t on_disk_sector =
device->ldev->md.md_offset + device->ldev->md.bm_offset;
@@ -1011,12 +1026,12 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
bio_add_page(bio, page, len, 0);
bio->bi_private = ctx;
bio->bi_end_io = drbd_bm_endio;
+ bio_set_op_attrs(bio, op, 0);
- if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
- bio->bi_rw |= rw;
+ if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
bio_io_error(bio);
} else {
- submit_bio(rw, bio);
+ submit_bio(bio);
/* this should not count as user activity and cause the
* resync to throttle -- see drbd_rs_should_slow_down(). */
atomic_add(len >> 9, &device->rs_sect_ev);
@@ -1030,7 +1045,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
{
struct drbd_bm_aio_ctx *ctx;
struct drbd_bitmap *b = device->bitmap;
- int num_pages, i, count = 0;
+ unsigned int num_pages, i, count = 0;
unsigned long now;
char ppb[10];
int err = 0;
@@ -1078,16 +1093,37 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
now = jiffies;
/* let the layers below us try to merge these bios... */
- for (i = 0; i < num_pages; i++) {
- /* ignore completely unchanged pages */
- if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
- break;
- if (!(flags & BM_AIO_READ)) {
- if ((flags & BM_AIO_WRITE_HINTED) &&
- !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
- &page_private(b->bm_pages[i])))
- continue;
+ if (flags & BM_AIO_READ) {
+ for (i = 0; i < num_pages; i++) {
+ atomic_inc(&ctx->in_flight);
+ bm_page_io_async(ctx, i);
+ ++count;
+ cond_resched();
+ }
+ } else if (flags & BM_AIO_WRITE_HINTED) {
+ /* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */
+ unsigned int hint;
+ for (hint = 0; hint < b->n_bitmap_hints; hint++) {
+ i = b->al_bitmap_hints[hint];
+ if (i >= num_pages) /* == -1U: no hint here. */
+ continue;
+ /* Several AL-extents may point to the same page. */
+ if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
+ &page_private(b->bm_pages[i])))
+ continue;
+ /* Has it even changed? */
+ if (bm_test_page_unchanged(b->bm_pages[i]))
+ continue;
+ atomic_inc(&ctx->in_flight);
+ bm_page_io_async(ctx, i);
+ ++count;
+ }
+ } else {
+ for (i = 0; i < num_pages; i++) {
+ /* ignore completely unchanged pages */
+ if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+ break;
if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
bm_test_page_unchanged(b->bm_pages[i])) {
dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
@@ -1100,11 +1136,11 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
continue;
}
+ atomic_inc(&ctx->in_flight);
+ bm_page_io_async(ctx, i);
+ ++count;
+ cond_resched();
}
- atomic_inc(&ctx->in_flight);
- bm_page_io_async(ctx, i);
- ++count;
- cond_resched();
}
/*
@@ -1121,10 +1157,14 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
/* summary for global bitmap IO */
- if (flags == 0)
- drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
- (flags & BM_AIO_READ) ? "READ" : "WRITE",
- count, jiffies - now);
+ if (flags == 0) {
+ unsigned int ms = jiffies_to_msecs(jiffies - now);
+ if (ms > 5) {
+ drbd_info(device, "bitmap %s of %u pages took %u ms\n",
+ (flags & BM_AIO_READ) ? "READ" : "WRITE",
+ count, ms);
+ }
+ }
if (ctx->error) {
drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index 4de95bbff486..de5c3ee8a790 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -237,14 +237,9 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re
seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
- if (f & EE_IS_TRIM) {
- seq_putc(m, sep);
- sep = '|';
- if (f & EE_IS_TRIM_USE_ZEROOUT)
- seq_puts(m, "zero-out");
- else
- seq_puts(m, "trim");
- }
+ if (f & EE_IS_TRIM)
+ __seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim");
+ seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
seq_putc(m, '\n');
}
@@ -430,9 +425,6 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo
/* Are we still linked,
* or has debugfs_remove() already been called? */
parent = file->f_path.dentry->d_parent;
- /* not sure if this can happen: */
- if (!parent || d_really_is_negative(parent))
- goto out;
/* serialize with d_delete() */
inode_lock(d_inode(parent));
/* Make sure the object is still alive */
@@ -445,7 +437,6 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo
if (ret)
kref_put(kref, release);
}
-out:
return ret;
}
@@ -908,7 +899,7 @@ static int drbd_version_open(struct inode *inode, struct file *file)
return single_open(file, drbd_version_show, NULL);
}
-static struct file_operations drbd_version_fops = {
+static const struct file_operations drbd_version_fops = {
.owner = THIS_MODULE,
.open = drbd_version_open,
.llseek = seq_lseek,
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 7a1cf7eaa71d..7b54354976a5 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -468,9 +468,15 @@ enum {
/* this is/was a write request */
__EE_WRITE,
+ /* this is/was a write same request */
+ __EE_WRITE_SAME,
+
/* this originates from application on peer
* (not some resync or verify or other DRBD internal request) */
__EE_APPLICATION,
+
+ /* If it contains only 0 bytes, send back P_RS_DEALLOCATED */
+ __EE_RS_THIN_REQ,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
@@ -484,7 +490,9 @@ enum {
#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
#define EE_SUBMITTED (1<<__EE_SUBMITTED)
#define EE_WRITE (1<<__EE_WRITE)
+#define EE_WRITE_SAME (1<<__EE_WRITE_SAME)
#define EE_APPLICATION (1<<__EE_APPLICATION)
+#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ)
/* flag bits per device */
enum {
@@ -1123,6 +1131,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int
extern int drbd_send_bitmap(struct drbd_device *device);
extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
+extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *);
extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
extern void drbd_device_cleanup(struct drbd_device *device);
void drbd_print_uuids(struct drbd_device *device, const char *text);
@@ -1327,14 +1336,14 @@ struct bm_extent {
#endif
#endif
-/* BIO_MAX_SIZE is 256 * PAGE_SIZE,
+/* Estimate max bio size as 256 * PAGE_SIZE,
* so for typical PAGE_SIZE of 4k, that is (1<<20) Byte.
* Since we may live in a mixed-platform cluster,
* we limit us to a platform agnostic constant here for now.
* A followup commit may allow even bigger BIO sizes,
* once we thought that through. */
#define DRBD_MAX_BIO_SIZE (1U << 20)
-#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#if DRBD_MAX_BIO_SIZE > (BIO_MAX_PAGES << PAGE_SHIFT)
#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
#endif
#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
@@ -1342,11 +1351,11 @@ struct bm_extent {
#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
-/* For now, don't allow more than one activity log extent worth of data
- * to be discarded in one go. We may need to rework drbd_al_begin_io()
- * to allow for even larger discard ranges */
-#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE
-#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
+/* For now, don't allow more than half of what we can "activate" in one
+ * activity log transaction to be discarded in one go. We may need to rework
+ * drbd_al_begin_io() to allow for even larger discard ranges */
+#define DRBD_MAX_BATCH_BIO_SIZE (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
+#define DRBD_MAX_BBIO_SECTORS (DRBD_MAX_BATCH_BIO_SIZE >> 9)
extern int drbd_bm_init(struct drbd_device *device);
extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
@@ -1369,6 +1378,7 @@ extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
extern int drbd_bm_read(struct drbd_device *device) __must_hold(local);
extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
extern int drbd_bm_write(struct drbd_device *device) __must_hold(local);
+extern void drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local);
extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
@@ -1483,12 +1493,14 @@ enum determine_dev_size {
extern enum determine_dev_size
drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
extern void resync_after_online_grow(struct drbd_device *);
-extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern void drbd_reconsider_queue_parameters(struct drbd_device *device,
+ struct drbd_backing_dev *bdev, struct o_qlim *o);
extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
enum drbd_role new_role,
int force);
extern bool conn_try_outdate_peer(struct drbd_connection *connection);
extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
+extern enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd);
extern int drbd_khelper(struct drbd_device *device, char *cmd);
/* drbd_worker.c */
@@ -1507,7 +1519,7 @@ extern int drbd_resync_finished(struct drbd_device *device);
extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
extern void drbd_md_put_buffer(struct drbd_device *device);
extern int drbd_md_sync_page_io(struct drbd_device *device,
- struct drbd_backing_dev *bdev, sector_t sector, int rw);
+ struct drbd_backing_dev *bdev, sector_t sector, int op);
extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int);
extern void wait_until_done_or_force_detached(struct drbd_device *device,
struct drbd_backing_dev *bdev, unsigned int *done);
@@ -1548,6 +1560,8 @@ extern void start_resync_timer_fn(unsigned long data);
extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
/* drbd_receiver.c */
+extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
+ sector_t start, unsigned int nr_sectors, bool discard);
extern int drbd_receiver(struct drbd_thread *thi);
extern int drbd_ack_receiver(struct drbd_thread *thi);
extern void drbd_send_ping_wf(struct work_struct *ws);
@@ -1557,11 +1571,11 @@ extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector
bool throttle_if_app_is_waiting);
extern int drbd_submit_peer_request(struct drbd_device *,
struct drbd_peer_request *, const unsigned,
- const int);
+ const unsigned, const int);
extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
sector_t, unsigned int,
- bool,
+ unsigned int,
gfp_t) __must_hold(local);
extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
int);
@@ -1635,8 +1649,6 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
/* drbd_proc.c */
extern struct proc_dir_entry *drbd_proc;
extern const struct file_operations drbd_proc_fops;
-extern const char *drbd_conn_str(enum drbd_conns s);
-extern const char *drbd_role_str(enum drbd_role s);
/* drbd_actlog.c */
extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
@@ -2095,13 +2107,22 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f
ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
}
+static inline bool is_sync_target_state(enum drbd_conns connection_state)
+{
+ return connection_state == C_SYNC_TARGET ||
+ connection_state == C_PAUSED_SYNC_T;
+}
+
+static inline bool is_sync_source_state(enum drbd_conns connection_state)
+{
+ return connection_state == C_SYNC_SOURCE ||
+ connection_state == C_PAUSED_SYNC_S;
+}
+
static inline bool is_sync_state(enum drbd_conns connection_state)
{
- return
- (connection_state == C_SYNC_SOURCE
- || connection_state == C_SYNC_TARGET
- || connection_state == C_PAUSED_SYNC_S
- || connection_state == C_PAUSED_SYNC_T);
+ return is_sync_source_state(connection_state) ||
+ is_sync_target_state(connection_state);
}
/**
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
index f210543f05f4..23c5a94428d2 100644
--- a/drivers/block/drbd/drbd_interval.h
+++ b/drivers/block/drbd/drbd_interval.h
@@ -6,13 +6,13 @@
struct drbd_interval {
struct rb_node rb;
- sector_t sector; /* start sector of the interval */
- unsigned int size; /* size in bytes */
- sector_t end; /* highest interval end in subtree */
- int local:1 /* local or remote request? */;
- int waiting:1; /* someone is waiting for this to complete */
- int completed:1; /* this has been completed already;
- * ignore for conflict detection */
+ sector_t sector; /* start sector of the interval */
+ unsigned int size; /* size in bytes */
+ sector_t end; /* highest interval end in subtree */
+ unsigned int local:1 /* local or remote request? */;
+ unsigned int waiting:1; /* someone is waiting for completion */
+ unsigned int completed:1; /* this has been completed already;
+ * ignore for conflict detection */
};
static inline void drbd_clear_interval(struct drbd_interval *i)
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 2ba1494b2799..0501ae0c517b 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -31,7 +31,7 @@
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/drbd.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
@@ -920,6 +920,31 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
}
}
+/* communicated if (agreed_features & DRBD_FF_WSAME) */
+void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct request_queue *q)
+{
+ if (q) {
+ p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
+ p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
+ p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q));
+ p->qlim->io_min = cpu_to_be32(queue_io_min(q));
+ p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
+ p->qlim->discard_enabled = blk_queue_discard(q);
+ p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
+ p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
+ } else {
+ q = device->rq_queue;
+ p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
+ p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
+ p->qlim->alignment_offset = 0;
+ p->qlim->io_min = cpu_to_be32(queue_io_min(q));
+ p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
+ p->qlim->discard_enabled = 0;
+ p->qlim->discard_zeroes_data = 0;
+ p->qlim->write_same_capable = 0;
+ }
+}
+
int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
{
struct drbd_device *device = peer_device->device;
@@ -928,29 +953,37 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
sector_t d_size, u_size;
int q_order_type;
unsigned int max_bio_size;
+ unsigned int packet_size;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ packet_size = sizeof(*p);
+ if (peer_device->connection->agreed_features & DRBD_FF_WSAME)
+ packet_size += sizeof(p->qlim[0]);
+
+ memset(p, 0, packet_size);
if (get_ldev_if_state(device, D_NEGOTIATING)) {
- D_ASSERT(device, device->ldev->backing_bdev);
+ struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
d_size = drbd_get_max_capacity(device->ldev);
rcu_read_lock();
u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
rcu_read_unlock();
q_order_type = drbd_queue_order_type(device);
- max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+ max_bio_size = queue_max_hw_sectors(q) << 9;
max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
+ assign_p_sizes_qlim(device, p, q);
put_ldev(device);
} else {
d_size = 0;
u_size = 0;
q_order_type = QUEUE_ORDERED_NONE;
max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
+ assign_p_sizes_qlim(device, p, NULL);
}
- sock = &peer_device->connection->data;
- p = drbd_prepare_command(peer_device, sock);
- if (!p)
- return -EIO;
-
if (peer_device->connection->agreed_pro_version <= 94)
max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
else if (peer_device->connection->agreed_pro_version < 100)
@@ -962,7 +995,8 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
p->max_bio_size = cpu_to_be32(max_bio_size);
p->queue_order_type = cpu_to_be16(q_order_type);
p->dds_flags = cpu_to_be16(flags);
- return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0);
+
+ return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0);
}
/**
@@ -1377,6 +1411,22 @@ int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
cpu_to_be64(block_id));
}
+int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_socket *sock;
+ struct p_block_desc *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(peer_req->i.sector);
+ p->blksize = cpu_to_be32(peer_req->i.size);
+ p->pad = 0;
+ return drbd_send_command(peer_device, sock, P_RS_DEALLOCATED, sizeof(*p), NULL, 0);
+}
+
int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
sector_t sector, int size, u64 block_id)
{
@@ -1561,6 +1611,9 @@ static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
? 0 : MSG_MORE);
if (err)
return err;
+ /* REQ_OP_WRITE_SAME has only one segment */
+ if (bio_op(bio) == REQ_OP_WRITE_SAME)
+ break;
}
return 0;
}
@@ -1579,6 +1632,9 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
if (err)
return err;
+ /* REQ_OP_WRITE_SAME has only one segment */
+ if (bio_op(bio) == REQ_OP_WRITE_SAME)
+ break;
}
return 0;
}
@@ -1603,15 +1659,17 @@ static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
return 0;
}
-static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long bi_rw)
+static u32 bio_flags_to_wire(struct drbd_connection *connection,
+ struct bio *bio)
{
if (connection->agreed_pro_version >= 95)
- return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
- (bi_rw & REQ_FUA ? DP_FUA : 0) |
- (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
- (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
+ return (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
+ (bio->bi_rw & REQ_FUA ? DP_FUA : 0) |
+ (bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) |
+ (bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
+ (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
else
- return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
+ return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
}
/* Used to send write or TRIM aka REQ_DISCARD requests
@@ -1622,6 +1680,8 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
struct drbd_device *device = peer_device->device;
struct drbd_socket *sock;
struct p_data *p;
+ struct p_wsame *wsame = NULL;
+ void *digest_out;
unsigned int dp_flags = 0;
int digest_size;
int err;
@@ -1636,7 +1696,7 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
p->sector = cpu_to_be64(req->i.sector);
p->block_id = (unsigned long)req;
p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq));
- dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio->bi_rw);
+ dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio);
if (device->state.conn >= C_SYNC_SOURCE &&
device->state.conn <= C_PAUSED_SYNC_T)
dp_flags |= DP_MAY_SET_IN_SYNC;
@@ -1657,12 +1717,29 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
goto out;
}
+ if (dp_flags & DP_WSAME) {
+ /* this will only work if DRBD_FF_WSAME is set AND the
+ * handshake agreed that all nodes and backend devices are
+ * WRITE_SAME capable and agree on logical_block_size */
+ wsame = (struct p_wsame*)p;
+ digest_out = wsame + 1;
+ wsame->size = cpu_to_be32(req->i.size);
+ } else
+ digest_out = p + 1;
/* our digest is still only over the payload.
* TRIM does not carry any payload. */
if (digest_size)
- drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
- err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size);
+ drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
+ if (wsame) {
+ err =
+ __send_command(peer_device->connection, device->vnr, sock, P_WSAME,
+ sizeof(*wsame) + digest_size, NULL,
+ bio_iovec(req->master_bio).bv_len);
+ } else
+ err =
+ __send_command(peer_device->connection, device->vnr, sock, P_DATA,
+ sizeof(*p) + digest_size, NULL, req->i.size);
if (!err) {
/* For protocol A, we have to memcpy the payload into
* socket buffers, as we may complete right away
@@ -3061,7 +3138,7 @@ void drbd_md_write(struct drbd_device *device, void *b)
D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset);
sector = device->ldev->md.md_offset;
- if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
+ if (drbd_md_sync_page_io(device, device->ldev, sector, REQ_OP_WRITE)) {
/* this was a try anyways ... */
drbd_err(device, "meta data update failed!\n");
drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
@@ -3263,7 +3340,8 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
* Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */
bdev->md.md_size_sect = 8;
- if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) {
+ if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset,
+ REQ_OP_READ)) {
/* NOTE: can't do normal error processing here as this is
called BEFORE disk is attached */
drbd_err(device, "Error while reading metadata.\n");
@@ -3505,7 +3583,12 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
struct bm_io_work *work = &device->bm_io_work;
int rv = -EIO;
- D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0);
+ if (work->flags != BM_LOCKED_CHANGE_ALLOWED) {
+ int cnt = atomic_read(&device->ap_bio_cnt);
+ if (cnt)
+ drbd_err(device, "FIXME: ap_bio_cnt %d, expected 0; queued for '%s'\n",
+ cnt, work->why);
+ }
if (get_ldev(device)) {
drbd_bm_lock(device, work->why, work->flags);
@@ -3585,18 +3668,20 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
char *why, enum bm_flag flags)
{
+ /* Only suspend io, if some operation is supposed to be locked out */
+ const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST);
int rv;
D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
- if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+ if (do_suspend_io)
drbd_suspend_io(device);
drbd_bm_lock(device, why, flags);
rv = io_fn(device);
drbd_bm_unlock(device);
- if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+ if (do_suspend_io)
drbd_resume_io(device);
return rv;
@@ -3635,6 +3720,8 @@ const char *cmdname(enum drbd_packet cmd)
* one PRO_VERSION */
static const char *cmdnames[] = {
[P_DATA] = "Data",
+ [P_WSAME] = "WriteSame",
+ [P_TRIM] = "Trim",
[P_DATA_REPLY] = "DataReply",
[P_RS_DATA_REPLY] = "RSDataReply",
[P_BARRIER] = "Barrier",
@@ -3679,6 +3766,8 @@ const char *cmdname(enum drbd_packet cmd)
[P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
[P_RETRY_WRITE] = "retry_write",
[P_PROTOCOL_UPDATE] = "protocol_update",
+ [P_RS_THIN_REQ] = "rs_thin_req",
+ [P_RS_DEALLOCATED] = "rs_deallocated",
/* enum drbd_packet, but not commands - obsoleted flags:
* P_MAY_IGNORE
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 0bac9c8246bc..f35db29cac76 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -343,7 +343,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
(char[20]) { }, /* address family */
(char[60]) { }, /* address */
NULL };
- char mb[12];
+ char mb[14];
char *argv[] = {usermode_helper, cmd, mb, NULL };
struct drbd_connection *connection = first_peer_device(device)->connection;
struct sib_info sib;
@@ -352,7 +352,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
if (current == connection->worker.task)
set_bit(CALLBACK_PENDING, &connection->flags);
- snprintf(mb, 12, "minor-%d", device_to_minor(device));
+ snprintf(mb, 14, "minor-%d", device_to_minor(device));
setup_khelper_env(connection, envp);
/* The helper may take some time.
@@ -387,7 +387,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd)
return ret;
}
-static int conn_khelper(struct drbd_connection *connection, char *cmd)
+enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd)
{
char *envp[] = { "HOME=/",
"TERM=linux",
@@ -442,19 +442,17 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec
}
rcu_read_unlock();
- if (fp == FP_NOT_AVAIL) {
- /* IO Suspending works on the whole resource.
- Do it only for one device. */
- vnr = 0;
- peer_device = idr_get_next(&connection->peer_devices, &vnr);
- drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0));
- }
-
return fp;
}
+static bool resource_is_supended(struct drbd_resource *resource)
+{
+ return resource->susp || resource->susp_fen || resource->susp_nod;
+}
+
bool conn_try_outdate_peer(struct drbd_connection *connection)
{
+ struct drbd_resource * const resource = connection->resource;
unsigned int connect_cnt;
union drbd_state mask = { };
union drbd_state val = { };
@@ -462,21 +460,41 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
char *ex_to_string;
int r;
- spin_lock_irq(&connection->resource->req_lock);
+ spin_lock_irq(&resource->req_lock);
if (connection->cstate >= C_WF_REPORT_PARAMS) {
drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
- spin_unlock_irq(&connection->resource->req_lock);
+ spin_unlock_irq(&resource->req_lock);
return false;
}
connect_cnt = connection->connect_cnt;
- spin_unlock_irq(&connection->resource->req_lock);
+ spin_unlock_irq(&resource->req_lock);
fp = highest_fencing_policy(connection);
switch (fp) {
case FP_NOT_AVAIL:
drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
- goto out;
+ spin_lock_irq(&resource->req_lock);
+ if (connection->cstate < C_WF_REPORT_PARAMS) {
+ _conn_request_state(connection,
+ (union drbd_state) { { .susp_fen = 1 } },
+ (union drbd_state) { { .susp_fen = 0 } },
+ CS_VERBOSE | CS_HARD | CS_DC_SUSP);
+ /* We are no longer suspended due to the fencing policy.
+ * We may still be suspended due to the on-no-data-accessible policy.
+ * If that was OND_IO_ERROR, fail pending requests. */
+ if (!resource_is_supended(resource))
+ _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+ }
+ /* Else: in case we raced with a connection handshake,
+ * let the handshake figure out if we maybe can RESEND,
+ * and do not resume/fail pending requests here.
+ * Worst case is we stay suspended for now, which may be
+ * resolved by either re-establishing the replication link, or
+ * the next link failure, or eventually the administrator. */
+ spin_unlock_irq(&resource->req_lock);
+ return false;
+
case FP_DONT_CARE:
return true;
default: ;
@@ -485,17 +503,17 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
r = conn_khelper(connection, "fence-peer");
switch ((r>>8) & 0xff) {
- case 3: /* peer is inconsistent */
+ case P_INCONSISTENT: /* peer is inconsistent */
ex_to_string = "peer is inconsistent or worse";
mask.pdsk = D_MASK;
val.pdsk = D_INCONSISTENT;
break;
- case 4: /* peer got outdated, or was already outdated */
+ case P_OUTDATED: /* peer got outdated, or was already outdated */
ex_to_string = "peer was fenced";
mask.pdsk = D_MASK;
val.pdsk = D_OUTDATED;
break;
- case 5: /* peer was down */
+ case P_DOWN: /* peer was down */
if (conn_highest_disk(connection) == D_UP_TO_DATE) {
/* we will(have) create(d) a new UUID anyways... */
ex_to_string = "peer is unreachable, assumed to be dead";
@@ -505,7 +523,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
}
break;
- case 6: /* Peer is primary, voluntarily outdate myself.
+ case P_PRIMARY: /* Peer is primary, voluntarily outdate myself.
* This is useful when an unconnected R_SECONDARY is asked to
* become R_PRIMARY, but finds the other peer being active. */
ex_to_string = "peer is active";
@@ -513,7 +531,9 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
mask.disk = D_MASK;
val.disk = D_OUTDATED;
break;
- case 7:
+ case P_FENCING:
+ /* THINK: do we need to handle this
+ * like case 4, or more like case 5? */
if (fp != FP_STONITH)
drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
ex_to_string = "peer was stonithed";
@@ -529,13 +549,11 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
drbd_info(connection, "fence-peer helper returned %d (%s)\n",
(r>>8) & 0xff, ex_to_string);
- out:
-
/* Not using
conn_request_state(connection, mask, val, CS_VERBOSE);
here, because we might were able to re-establish the connection in the
meantime. */
- spin_lock_irq(&connection->resource->req_lock);
+ spin_lock_irq(&resource->req_lock);
if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
if (connection->connect_cnt != connect_cnt)
/* In case the connection was established and droped
@@ -544,7 +562,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
else
_conn_request_state(connection, mask, val, CS_VERBOSE);
}
- spin_unlock_irq(&connection->resource->req_lock);
+ spin_unlock_irq(&resource->req_lock);
return conn_highest_pdsk(connection) <= D_OUTDATED;
}
@@ -1154,51 +1172,160 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
return 0;
}
+static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity)
+{
+ q->limits.discard_granularity = granularity;
+}
+
+static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
+{
+ /* when we introduced REQ_WRITE_SAME support, we also bumped
+ * our maximum supported batch bio size used for discards. */
+ if (connection->agreed_features & DRBD_FF_WSAME)
+ return DRBD_MAX_BBIO_SECTORS;
+ /* before, with DRBD <= 8.4.6, we only allowed up to one AL_EXTENT_SIZE. */
+ return AL_EXTENT_SIZE >> 9;
+}
+
+static void decide_on_discard_support(struct drbd_device *device,
+ struct request_queue *q,
+ struct request_queue *b,
+ bool discard_zeroes_if_aligned)
+{
+ /* q = drbd device queue (device->rq_queue)
+ * b = backing device queue (device->ldev->backing_bdev->bd_disk->queue),
+ * or NULL if diskless
+ */
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ bool can_do = b ? blk_queue_discard(b) : true;
+
+ if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) {
+ can_do = false;
+ drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
+ }
+ if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
+ can_do = false;
+ drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
+ }
+ if (can_do) {
+ /* We don't care for the granularity, really.
+ * Stacking limits below should fix it for the local
+ * device. Whether or not it is a suitable granularity
+ * on the remote device is not our problem, really. If
+ * you care, you need to use devices with similar
+ * topology on all peers. */
+ blk_queue_discard_granularity(q, 512);
+ q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ } else {
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+ blk_queue_discard_granularity(q, 0);
+ q->limits.max_discard_sectors = 0;
+ }
+}
+
+static void fixup_discard_if_not_supported(struct request_queue *q)
+{
+ /* To avoid confusion, if this queue does not support discard, clear
+ * max_discard_sectors, which is what lsblk -D reports to the user.
+ * Older kernels got this wrong in "stack limits".
+ * */
+ if (!blk_queue_discard(q)) {
+ blk_queue_max_discard_sectors(q, 0);
+ blk_queue_discard_granularity(q, 0);
+ }
+}
+
+static void decide_on_write_same_support(struct drbd_device *device,
+ struct request_queue *q,
+ struct request_queue *b, struct o_qlim *o)
+{
+ struct drbd_peer_device *peer_device = first_peer_device(device);
+ struct drbd_connection *connection = peer_device->connection;
+ bool can_do = b ? b->limits.max_write_same_sectors : true;
+
+ if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) {
+ can_do = false;
+ drbd_info(peer_device, "peer does not support WRITE_SAME\n");
+ }
+
+ if (o) {
+ /* logical block size; queue_logical_block_size(NULL) is 512 */
+ unsigned int peer_lbs = be32_to_cpu(o->logical_block_size);
+ unsigned int me_lbs_b = queue_logical_block_size(b);
+ unsigned int me_lbs = queue_logical_block_size(q);
+
+ if (me_lbs_b != me_lbs) {
+ drbd_warn(device,
+ "logical block size of local backend does not match (drbd:%u, backend:%u); was this a late attach?\n",
+ me_lbs, me_lbs_b);
+ /* rather disable write same than trigger some BUG_ON later in the scsi layer. */
+ can_do = false;
+ }
+ if (me_lbs_b != peer_lbs) {
+ drbd_warn(peer_device, "logical block sizes do not match (me:%u, peer:%u); this may cause problems.\n",
+ me_lbs, peer_lbs);
+ if (can_do) {
+ drbd_dbg(peer_device, "logical block size mismatch: WRITE_SAME disabled.\n");
+ can_do = false;
+ }
+ me_lbs = max(me_lbs, me_lbs_b);
+ /* We cannot change the logical block size of an in-use queue.
+ * We can only hope that access happens to be properly aligned.
+ * If not, the peer will likely produce an IO error, and detach. */
+ if (peer_lbs > me_lbs) {
+ if (device->state.role != R_PRIMARY) {
+ blk_queue_logical_block_size(q, peer_lbs);
+ drbd_warn(peer_device, "logical block size set to %u\n", peer_lbs);
+ } else {
+ drbd_warn(peer_device,
+ "current Primary must NOT adjust logical block size (%u -> %u); hope for the best.\n",
+ me_lbs, peer_lbs);
+ }
+ }
+ }
+ if (can_do && !o->write_same_capable) {
+ /* If we introduce an open-coded write-same loop on the receiving side,
+ * the peer would present itself as "capable". */
+ drbd_dbg(peer_device, "WRITE_SAME disabled (peer device not capable)\n");
+ can_do = false;
+ }
+ }
+
+ blk_queue_max_write_same_sectors(q, can_do ? DRBD_MAX_BBIO_SECTORS : 0);
+}
+
static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
- unsigned int max_bio_size)
+ unsigned int max_bio_size, struct o_qlim *o)
{
struct request_queue * const q = device->rq_queue;
unsigned int max_hw_sectors = max_bio_size >> 9;
unsigned int max_segments = 0;
struct request_queue *b = NULL;
+ struct disk_conf *dc;
+ bool discard_zeroes_if_aligned = true;
if (bdev) {
b = bdev->backing_bdev->bd_disk->queue;
max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
rcu_read_lock();
- max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
+ dc = rcu_dereference(device->ldev->disk_conf);
+ max_segments = dc->max_bio_bvecs;
+ discard_zeroes_if_aligned = dc->discard_zeroes_if_aligned;
rcu_read_unlock();
blk_set_stacking_limits(&q->limits);
- blk_queue_max_write_same_sectors(q, 0);
}
- blk_queue_logical_block_size(q, 512);
blk_queue_max_hw_sectors(q, max_hw_sectors);
/* This is the workaround for "bio would need to, but cannot, be split" */
blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
blk_queue_segment_boundary(q, PAGE_SIZE-1);
+ decide_on_discard_support(device, q, b, discard_zeroes_if_aligned);
+ decide_on_write_same_support(device, q, b, o);
if (b) {
- struct drbd_connection *connection = first_peer_device(device)->connection;
-
- blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS);
-
- if (blk_queue_discard(b) &&
- (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
- /* We don't care, stacking below should fix it for the local device.
- * Whether or not it is a suitable granularity on the remote device
- * is not our problem, really. If you care, you need to
- * use devices with similar topology on all peers. */
- q->limits.discard_granularity = 512;
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
- } else {
- blk_queue_max_discard_sectors(q, 0);
- queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
- q->limits.discard_granularity = 0;
- }
-
blk_queue_stack_limits(q, b);
if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
@@ -1208,15 +1335,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
}
}
- /* To avoid confusion, if this queue does not support discard, clear
- * max_discard_sectors, which is what lsblk -D reports to the user. */
- if (!blk_queue_discard(q)) {
- blk_queue_max_discard_sectors(q, 0);
- q->limits.discard_granularity = 0;
- }
+ fixup_discard_if_not_supported(q);
}
-void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
+void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
{
unsigned int now, new, local, peer;
@@ -1259,7 +1381,7 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backin
if (new != now)
drbd_info(device, "max BIO size = %u\n", new);
- drbd_setup_queue_param(device, bdev, new);
+ drbd_setup_queue_param(device, bdev, new, o);
}
/* Starts the worker thread */
@@ -1348,6 +1470,43 @@ static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
a->disk_drain != b->disk_drain;
}
+static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *disk_conf,
+ struct drbd_backing_dev *nbc)
+{
+ struct request_queue * const q = nbc->backing_bdev->bd_disk->queue;
+
+ if (disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+ disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (disk_conf->al_extents > drbd_al_extents_max(nbc))
+ disk_conf->al_extents = drbd_al_extents_max(nbc);
+
+ if (!blk_queue_discard(q)
+ || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) {
+ if (disk_conf->rs_discard_granularity) {
+ disk_conf->rs_discard_granularity = 0; /* disable feature */
+ drbd_info(device, "rs_discard_granularity feature disabled\n");
+ }
+ }
+
+ if (disk_conf->rs_discard_granularity) {
+ int orig_value = disk_conf->rs_discard_granularity;
+ int remainder;
+
+ if (q->limits.discard_granularity > disk_conf->rs_discard_granularity)
+ disk_conf->rs_discard_granularity = q->limits.discard_granularity;
+
+ remainder = disk_conf->rs_discard_granularity % q->limits.discard_granularity;
+ disk_conf->rs_discard_granularity += remainder;
+
+ if (disk_conf->rs_discard_granularity > q->limits.max_discard_sectors << 9)
+ disk_conf->rs_discard_granularity = q->limits.max_discard_sectors << 9;
+
+ if (disk_conf->rs_discard_granularity != orig_value)
+ drbd_info(device, "rs_discard_granularity changed to %d\n",
+ disk_conf->rs_discard_granularity);
+ }
+}
+
int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
{
struct drbd_config_context adm_ctx;
@@ -1395,10 +1554,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
if (!expect(new_disk_conf->resync_rate >= 1))
new_disk_conf->resync_rate = 1;
- if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
- new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
- if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev))
- new_disk_conf->al_extents = drbd_al_extents_max(device->ldev);
+ sanitize_disk_conf(device, new_disk_conf, device->ldev);
if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
@@ -1457,6 +1613,9 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
if (write_ordering_changed(old_disk_conf, new_disk_conf))
drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
+ if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned)
+ drbd_reconsider_queue_parameters(device, device->ldev, NULL);
+
drbd_md_sync(device);
if (device->state.conn >= C_CONNECTED) {
@@ -1693,10 +1852,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
if (retcode != NO_ERROR)
goto fail;
- if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
- new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
- if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
- new_disk_conf->al_extents = drbd_al_extents_max(nbc);
+ sanitize_disk_conf(device, new_disk_conf, nbc);
if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
@@ -1838,7 +1994,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
device->read_cnt = 0;
device->writ_cnt = 0;
- drbd_reconsider_max_bio_size(device, device->ldev);
+ drbd_reconsider_queue_parameters(device, device->ldev, NULL);
/* If I am currently not R_PRIMARY,
* but meta data primary indicator is set,
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 6537b25db9c1..be2b93fd2c11 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -25,7 +25,7 @@
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
@@ -122,18 +122,18 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
x = res/50;
y = 20-x;
- seq_printf(seq, "\t[");
+ seq_puts(seq, "\t[");
for (i = 1; i < x; i++)
- seq_printf(seq, "=");
- seq_printf(seq, ">");
+ seq_putc(seq, '=');
+ seq_putc(seq, '>');
for (i = 0; i < y; i++)
seq_printf(seq, ".");
- seq_printf(seq, "] ");
+ seq_puts(seq, "] ");
if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
- seq_printf(seq, "verified:");
+ seq_puts(seq, "verified:");
else
- seq_printf(seq, "sync'ed:");
+ seq_puts(seq, "sync'ed:");
seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
/* if more than a few GB, display in MB */
@@ -146,7 +146,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
(unsigned long) Bit2KB(rs_left),
(unsigned long) Bit2KB(rs_total));
- seq_printf(seq, "\n\t");
+ seq_puts(seq, "\n\t");
/* see drivers/md/md.c
* We do not want to overflow, so the order of operands and
@@ -175,9 +175,9 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
rt / 3600, (rt % 3600) / 60, rt % 60);
dbdt = Bit2KB(db/dt);
- seq_printf(seq, " speed: ");
+ seq_puts(seq, " speed: ");
seq_printf_with_thousands_grouping(seq, dbdt);
- seq_printf(seq, " (");
+ seq_puts(seq, " (");
/* ------------------------- ~3s average ------------------------ */
if (proc_details >= 1) {
/* this is what drbd_rs_should_slow_down() uses */
@@ -188,7 +188,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
db = device->rs_mark_left[i] - rs_left;
dbdt = Bit2KB(db/dt);
seq_printf_with_thousands_grouping(seq, dbdt);
- seq_printf(seq, " -- ");
+ seq_puts(seq, " -- ");
}
/* --------------------- long term average ---------------------- */
@@ -200,11 +200,11 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
db = rs_total - rs_left;
dbdt = Bit2KB(db/dt);
seq_printf_with_thousands_grouping(seq, dbdt);
- seq_printf(seq, ")");
+ seq_putc(seq, ')');
if (state.conn == C_SYNC_TARGET ||
state.conn == C_VERIFY_S) {
- seq_printf(seq, " want: ");
+ seq_puts(seq, " want: ");
seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
}
seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
@@ -231,7 +231,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se
(unsigned long long)bm_bits * BM_SECT_PER_BIT);
if (stop_sector != 0 && stop_sector != ULLONG_MAX)
seq_printf(seq, " stop sector: %llu", stop_sector);
- seq_printf(seq, "\n");
+ seq_putc(seq, '\n');
}
}
@@ -276,7 +276,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
rcu_read_lock();
idr_for_each_entry(&drbd_devices, device, i) {
if (prev_i != i - 1)
- seq_printf(seq, "\n");
+ seq_putc(seq, '\n');
prev_i = i;
state = device->state;
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index ef9245363dcc..4d296800f706 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -60,6 +60,15 @@ enum drbd_packet {
* which is why I chose TRIM here, to disambiguate. */
P_TRIM = 0x31,
+ /* Only use these two if both support FF_THIN_RESYNC */
+ P_RS_THIN_REQ = 0x32, /* Request a block for resync or reply P_RS_DEALLOCATED */
+ P_RS_DEALLOCATED = 0x33, /* Contains only zeros on sync source node */
+
+ /* REQ_WRITE_SAME.
+ * On a receiving side without REQ_WRITE_SAME,
+ * we may fall back to an opencoded loop instead. */
+ P_WSAME = 0x34,
+
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101,
@@ -106,16 +115,20 @@ struct p_header100 {
u32 pad;
} __packed;
-/* these defines must not be changed without changing the protocol version */
-#define DP_HARDBARRIER 1 /* depricated */
+/* These defines must not be changed without changing the protocol version.
+ * New defines may only be introduced together with protocol version bump or
+ * new protocol feature flags.
+ */
+#define DP_HARDBARRIER 1 /* no longer used */
#define DP_RW_SYNC 2 /* equals REQ_SYNC */
#define DP_MAY_SET_IN_SYNC 4
#define DP_UNPLUG 8 /* not used anymore */
#define DP_FUA 16 /* equals REQ_FUA */
-#define DP_FLUSH 32 /* equals REQ_FLUSH */
+#define DP_FLUSH 32 /* equals REQ_PREFLUSH */
#define DP_DISCARD 64 /* equals REQ_DISCARD */
#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
+#define DP_WSAME 512 /* equiv. REQ_WRITE_SAME */
struct p_data {
u64 sector; /* 64 bits sector number */
@@ -129,6 +142,11 @@ struct p_trim {
u32 size; /* == bio->bi_size */
} __packed;
+struct p_wsame {
+ struct p_data p_data;
+ u32 size; /* == bio->bi_size */
+} __packed;
+
/*
* commands which share a struct:
* p_block_ack:
@@ -160,7 +178,23 @@ struct p_block_req {
* ReportParams
*/
-#define FF_TRIM 1
+/* supports TRIM/DISCARD on the "wire" protocol */
+#define DRBD_FF_TRIM 1
+
+/* Detect all-zeros during resync, and rather TRIM/UNMAP/DISCARD those blocks
+ * instead of fully allocate a supposedly thin volume on initial resync */
+#define DRBD_FF_THIN_RESYNC 2
+
+/* supports REQ_WRITE_SAME on the "wire" protocol.
+ * Note: this flag is overloaded,
+ * its presence also
+ * - indicates support for 128 MiB "batch bios",
+ * max discard size of 128 MiB
+ * instead of 4M before that.
+ * - indicates that we exchange additional settings in p_sizes
+ * drbd_send_sizes()/receive_sizes()
+ */
+#define DRBD_FF_WSAME 4
struct p_connection_features {
u32 protocol_min;
@@ -235,6 +269,40 @@ struct p_rs_uuid {
u64 uuid;
} __packed;
+/* optional queue_limits if (agreed_features & DRBD_FF_WSAME)
+ * see also struct queue_limits, as of late 2015 */
+struct o_qlim {
+ /* we don't need it yet, but we may as well communicate it now */
+ u32 physical_block_size;
+
+ /* so the original in struct queue_limits is unsigned short,
+ * but I'd have to put in padding anyways. */
+ u32 logical_block_size;
+
+ /* One incoming bio becomes one DRBD request,
+ * which may be translated to several bio on the receiving side.
+ * We don't need to communicate chunk/boundary/segment ... limits.
+ */
+
+ /* various IO hints may be useful with "diskless client" setups */
+ u32 alignment_offset;
+ u32 io_min;
+ u32 io_opt;
+
+ /* We may need to communicate integrity stuff at some point,
+ * but let's not get ahead of ourselves. */
+
+ /* Backend discard capabilities.
+ * Receiving side uses "blkdev_issue_discard()", no need to communicate
+ * more specifics. If the backend cannot do discards, the DRBD peer
+ * may fall back to blkdev_issue_zeroout().
+ */
+ u8 discard_enabled;
+ u8 discard_zeroes_data;
+ u8 write_same_capable;
+ u8 _pad;
+} __packed;
+
struct p_sizes {
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
@@ -242,6 +310,9 @@ struct p_sizes {
u32 max_bio_size; /* Maximal size of a BIO */
u16 queue_order_type; /* not yet implemented in DRBD*/
u16 dds_flags; /* use enum dds_flags here. */
+
+ /* optional queue_limits if (agreed_features & DRBD_FF_WSAME) */
+ struct o_qlim qlim[0];
} __packed;
struct p_state {
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 050aaa1c0350..df45713dfbe8 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -25,7 +25,7 @@
#include <linux/module.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <net/sock.h>
#include <linux/drbd.h>
@@ -48,7 +48,7 @@
#include "drbd_req.h"
#include "drbd_vli.h"
-#define PRO_FEATURES (FF_TRIM)
+#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME)
struct packet_info {
enum drbd_packet cmd;
@@ -361,14 +361,17 @@ You must not have the req_lock:
drbd_wait_ee_list_empty()
*/
+/* normal: payload_size == request size (bi_size)
+ * w_same: payload_size == logical_block_size
+ * trim: payload_size == 0 */
struct drbd_peer_request *
drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
- unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
+ unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
{
struct drbd_device *device = peer_device->device;
struct drbd_peer_request *peer_req;
struct page *page = NULL;
- unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT;
if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
return NULL;
@@ -380,7 +383,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
return NULL;
}
- if (has_payload && data_size) {
+ if (nr_pages) {
page = drbd_alloc_pages(peer_device, nr_pages,
gfpflags_allow_blocking(gfp_mask));
if (!page)
@@ -390,7 +393,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
memset(peer_req, 0, sizeof(*peer_req));
INIT_LIST_HEAD(&peer_req->w.list);
drbd_clear_interval(&peer_req->i);
- peer_req->i.size = data_size;
+ peer_req->i.size = request_size;
peer_req->i.sector = sector;
peer_req->submit_jif = jiffies;
peer_req->peer_device = peer_device;
@@ -1204,13 +1207,84 @@ static int drbd_recv_header(struct drbd_connection *connection, struct packet_in
return err;
}
-static void drbd_flush(struct drbd_connection *connection)
+/* This is blkdev_issue_flush, but asynchronous.
+ * We want to submit to all component volumes in parallel,
+ * then wait for all completions.
+ */
+struct issue_flush_context {
+ atomic_t pending;
+ int error;
+ struct completion done;
+};
+struct one_flush_context {
+ struct drbd_device *device;
+ struct issue_flush_context *ctx;
+};
+
+void one_flush_endio(struct bio *bio)
{
- int rv;
- struct drbd_peer_device *peer_device;
- int vnr;
+ struct one_flush_context *octx = bio->bi_private;
+ struct drbd_device *device = octx->device;
+ struct issue_flush_context *ctx = octx->ctx;
+
+ if (bio->bi_error) {
+ ctx->error = bio->bi_error;
+ drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error);
+ }
+ kfree(octx);
+ bio_put(bio);
+
+ clear_bit(FLUSH_PENDING, &device->flags);
+ put_ldev(device);
+ kref_put(&device->kref, drbd_destroy_device);
+
+ if (atomic_dec_and_test(&ctx->pending))
+ complete(&ctx->done);
+}
+
+static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
+{
+ struct bio *bio = bio_alloc(GFP_NOIO, 0);
+ struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
+ if (!bio || !octx) {
+ drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n");
+ /* FIXME: what else can I do now? disconnecting or detaching
+ * really does not help to improve the state of the world, either.
+ */
+ kfree(octx);
+ if (bio)
+ bio_put(bio);
+
+ ctx->error = -ENOMEM;
+ put_ldev(device);
+ kref_put(&device->kref, drbd_destroy_device);
+ return;
+ }
+ octx->device = device;
+ octx->ctx = ctx;
+ bio->bi_bdev = device->ldev->backing_bdev;
+ bio->bi_private = octx;
+ bio->bi_end_io = one_flush_endio;
+ bio_set_op_attrs(bio, REQ_OP_FLUSH, WRITE_FLUSH);
+
+ device->flush_jif = jiffies;
+ set_bit(FLUSH_PENDING, &device->flags);
+ atomic_inc(&ctx->pending);
+ submit_bio(bio);
+}
+
+static void drbd_flush(struct drbd_connection *connection)
+{
if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
+ struct drbd_peer_device *peer_device;
+ struct issue_flush_context ctx;
+ int vnr;
+
+ atomic_set(&ctx.pending, 1);
+ ctx.error = 0;
+ init_completion(&ctx.done);
+
rcu_read_lock();
idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
struct drbd_device *device = peer_device->device;
@@ -1220,31 +1294,24 @@ static void drbd_flush(struct drbd_connection *connection)
kref_get(&device->kref);
rcu_read_unlock();
- /* Right now, we have only this one synchronous code path
- * for flushes between request epochs.
- * We may want to make those asynchronous,
- * or at least parallelize the flushes to the volume devices.
- */
- device->flush_jif = jiffies;
- set_bit(FLUSH_PENDING, &device->flags);
- rv = blkdev_issue_flush(device->ldev->backing_bdev,
- GFP_NOIO, NULL);
- clear_bit(FLUSH_PENDING, &device->flags);
- if (rv) {
- drbd_info(device, "local disk flush failed with status %d\n", rv);
- /* would rather check on EOPNOTSUPP, but that is not reliable.
- * don't try again for ANY return value != 0
- * if (rv == -EOPNOTSUPP) */
- drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
- }
- put_ldev(device);
- kref_put(&device->kref, drbd_destroy_device);
+ submit_one_flush(device, &ctx);
rcu_read_lock();
- if (rv)
- break;
}
rcu_read_unlock();
+
+ /* Do we want to add a timeout,
+ * if disk-timeout is set? */
+ if (!atomic_dec_and_test(&ctx.pending))
+ wait_for_completion(&ctx.done);
+
+ if (ctx.error) {
+ /* would rather check on EOPNOTSUPP, but that is not reliable.
+ * don't try again for ANY return value != 0
+ * if (rv == -EOPNOTSUPP) */
+ /* Any error is already reported by bio_endio callback. */
+ drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
+ }
}
}
@@ -1379,6 +1446,120 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
}
+/*
+ * We *may* ignore the discard-zeroes-data setting, if so configured.
+ *
+ * Assumption is that it "discard_zeroes_data=0" is only because the backend
+ * may ignore partial unaligned discards.
+ *
+ * LVM/DM thin as of at least
+ * LVM version: 2.02.115(2)-RHEL7 (2015-01-28)
+ * Library version: 1.02.93-RHEL7 (2015-01-28)
+ * Driver version: 4.29.0
+ * still behaves this way.
+ *
+ * For unaligned (wrt. alignment and granularity) or too small discards,
+ * we zero-out the initial (and/or) trailing unaligned partial chunks,
+ * but discard all the aligned full chunks.
+ *
+ * At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1".
+ */
+int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, bool discard)
+{
+ struct block_device *bdev = device->ldev->backing_bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ sector_t tmp, nr;
+ unsigned int max_discard_sectors, granularity;
+ int alignment;
+ int err = 0;
+
+ if (!discard)
+ goto zero_out;
+
+ /* Zero-sector (unknown) and one-sector granularities are the same. */
+ granularity = max(q->limits.discard_granularity >> 9, 1U);
+ alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
+
+ max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
+ max_discard_sectors -= max_discard_sectors % granularity;
+ if (unlikely(!max_discard_sectors))
+ goto zero_out;
+
+ if (nr_sectors < granularity)
+ goto zero_out;
+
+ tmp = start;
+ if (sector_div(tmp, granularity) != alignment) {
+ if (nr_sectors < 2*granularity)
+ goto zero_out;
+ /* start + gran - (start + gran - align) % gran */
+ tmp = start + granularity - alignment;
+ tmp = start + granularity - sector_div(tmp, granularity);
+
+ nr = tmp - start;
+ err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
+ nr_sectors -= nr;
+ start = tmp;
+ }
+ while (nr_sectors >= granularity) {
+ nr = min_t(sector_t, nr_sectors, max_discard_sectors);
+ err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
+ nr_sectors -= nr;
+ start += nr;
+ }
+ zero_out:
+ if (nr_sectors) {
+ err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0);
+ }
+ return err != 0;
+}
+
+static bool can_do_reliable_discards(struct drbd_device *device)
+{
+ struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
+ struct disk_conf *dc;
+ bool can_do;
+
+ if (!blk_queue_discard(q))
+ return false;
+
+ if (q->limits.discard_zeroes_data)
+ return true;
+
+ rcu_read_lock();
+ dc = rcu_dereference(device->ldev->disk_conf);
+ can_do = dc->discard_zeroes_if_aligned;
+ rcu_read_unlock();
+ return can_do;
+}
+
+static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+ /* If the backend cannot discard, or does not guarantee
+ * read-back zeroes in discarded ranges, we fall back to
+ * zero-out. Unless configuration specifically requested
+ * otherwise. */
+ if (!can_do_reliable_discards(device))
+ peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+
+ if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
+ peer_req->i.size >> 9, !(peer_req->flags & EE_IS_TRIM_USE_ZEROOUT)))
+ peer_req->flags |= EE_WAS_ERROR;
+ drbd_endio_write_sec_final(peer_req);
+}
+
+static void drbd_issue_peer_wsame(struct drbd_device *device,
+ struct drbd_peer_request *peer_req)
+{
+ struct block_device *bdev = device->ldev->backing_bdev;
+ sector_t s = peer_req->i.sector;
+ sector_t nr = peer_req->i.size >> 9;
+ if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages))
+ peer_req->flags |= EE_WAS_ERROR;
+ drbd_endio_write_sec_final(peer_req);
+}
+
+
/**
* drbd_submit_peer_request()
* @device: DRBD device.
@@ -1398,7 +1579,8 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
/* TODO allocate from our own bio_set. */
int drbd_submit_peer_request(struct drbd_device *device,
struct drbd_peer_request *peer_req,
- const unsigned rw, const int fault_type)
+ const unsigned op, const unsigned op_flags,
+ const int fault_type)
{
struct bio *bios = NULL;
struct bio *bio;
@@ -1409,7 +1591,13 @@ int drbd_submit_peer_request(struct drbd_device *device,
unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
int err = -ENOMEM;
- if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
+ /* TRIM/DISCARD: for now, always use the helper function
+ * blkdev_issue_zeroout(..., discard=true).
+ * It's synchronous, but it does the right thing wrt. bio splitting.
+ * Correctness first, performance later. Next step is to code an
+ * asynchronous variant of the same.
+ */
+ if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) {
/* wait for all pending IO completions, before we start
* zeroing things out. */
conn_wait_active_ee_empty(peer_req->peer_device->connection);
@@ -1417,22 +1605,22 @@ int drbd_submit_peer_request(struct drbd_device *device,
* so we can find it to present it in debugfs */
peer_req->submit_jif = jiffies;
peer_req->flags |= EE_SUBMITTED;
- spin_lock_irq(&device->resource->req_lock);
- list_add_tail(&peer_req->w.list, &device->active_ee);
- spin_unlock_irq(&device->resource->req_lock);
- if (blkdev_issue_zeroout(device->ldev->backing_bdev,
- sector, data_size >> 9, GFP_NOIO, false))
- peer_req->flags |= EE_WAS_ERROR;
- drbd_endio_write_sec_final(peer_req);
+
+ /* If this was a resync request from receive_rs_deallocated(),
+ * it is already on the sync_ee list */
+ if (list_empty(&peer_req->w.list)) {
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->active_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+
+ if (peer_req->flags & EE_IS_TRIM)
+ drbd_issue_peer_discard(device, peer_req);
+ else /* EE_WRITE_SAME */
+ drbd_issue_peer_wsame(device, peer_req);
return 0;
}
- /* Discards don't have any payload.
- * But the scsi layer still expects a bio_vec it can use internally,
- * see sd_setup_discard_cmnd() and blk_add_request_payload(). */
- if (peer_req->flags & EE_IS_TRIM)
- nr_pages = 1;
-
/* In most cases, we will only need one bio. But in case the lower
* level restrictions happen to be different at this offset on this
* side than those of the sending peer, we may need to submit the
@@ -1450,7 +1638,7 @@ next_bio:
/* > peer_req->i.sector, unless this is the first bio */
bio->bi_iter.bi_sector = sector;
bio->bi_bdev = device->ldev->backing_bdev;
- bio->bi_rw = rw;
+ bio_set_op_attrs(bio, op, op_flags);
bio->bi_private = peer_req;
bio->bi_end_io = drbd_peer_request_endio;
@@ -1458,11 +1646,6 @@ next_bio:
bios = bio;
++n_bios;
- if (rw & REQ_DISCARD) {
- bio->bi_iter.bi_size = data_size;
- goto submit;
- }
-
page_chain_for_each(page) {
unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
if (!bio_add_page(bio, page, len, 0)) {
@@ -1484,7 +1667,6 @@ next_bio:
--nr_pages;
}
D_ASSERT(device, data_size == 0);
-submit:
D_ASSERT(device, page == NULL);
atomic_set(&peer_req->pending_bios, n_bios);
@@ -1608,8 +1790,26 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
return 0;
}
+/* quick wrapper in case payload size != request_size (write same) */
+static void drbd_csum_ee_size(struct crypto_ahash *h,
+ struct drbd_peer_request *r, void *d,
+ unsigned int payload_size)
+{
+ unsigned int tmp = r->i.size;
+ r->i.size = payload_size;
+ drbd_csum_ee(h, r, d);
+ r->i.size = tmp;
+}
+
/* used from receive_RSDataReply (recv_resync_read)
- * and from receive_Data */
+ * and from receive_Data.
+ * data_size: actual payload ("data in")
+ * for normal writes that is bi_size.
+ * for discards, that is zero.
+ * for write same, it is logical_block_size.
+ * both trim and write same have the bi_size ("data len to be affected")
+ * as extra argument in the packet header.
+ */
static struct drbd_peer_request *
read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
struct packet_info *pi) __must_hold(local)
@@ -1624,6 +1824,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
void *dig_vv = peer_device->connection->int_dig_vv;
unsigned long *data;
struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
+ struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL;
digest_size = 0;
if (!trim && peer_device->connection->peer_integrity_tfm) {
@@ -1638,38 +1839,60 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
data_size -= digest_size;
}
+ /* assume request_size == data_size, but special case trim and wsame. */
+ ds = data_size;
if (trim) {
- D_ASSERT(peer_device, data_size == 0);
- data_size = be32_to_cpu(trim->size);
+ if (!expect(data_size == 0))
+ return NULL;
+ ds = be32_to_cpu(trim->size);
+ } else if (wsame) {
+ if (data_size != queue_logical_block_size(device->rq_queue)) {
+ drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n",
+ data_size, queue_logical_block_size(device->rq_queue));
+ return NULL;
+ }
+ if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) {
+ drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n",
+ data_size, bdev_logical_block_size(device->ldev->backing_bdev));
+ return NULL;
+ }
+ ds = be32_to_cpu(wsame->size);
}
- if (!expect(IS_ALIGNED(data_size, 512)))
+ if (!expect(IS_ALIGNED(ds, 512)))
return NULL;
- /* prepare for larger trim requests. */
- if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
+ if (trim || wsame) {
+ if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
+ return NULL;
+ } else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
return NULL;
/* even though we trust out peer,
* we sometimes have to double check. */
- if (sector + (data_size>>9) > capacity) {
+ if (sector + (ds>>9) > capacity) {
drbd_err(device, "request from peer beyond end of local disk: "
"capacity: %llus < sector: %llus + size: %u\n",
(unsigned long long)capacity,
- (unsigned long long)sector, data_size);
+ (unsigned long long)sector, ds);
return NULL;
}
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
+ peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
if (!peer_req)
return NULL;
peer_req->flags |= EE_WRITE;
- if (trim)
+ if (trim) {
+ peer_req->flags |= EE_IS_TRIM;
return peer_req;
+ }
+ if (wsame)
+ peer_req->flags |= EE_WRITE_SAME;
+ /* receive payload size bytes into page chain */
ds = data_size;
page = peer_req->pages;
page_chain_for_each(page) {
@@ -1689,7 +1912,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
}
if (digest_size) {
- drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv);
+ drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
if (memcmp(dig_in, dig_vv, digest_size)) {
drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
(unsigned long long)sector, data_size);
@@ -1830,7 +2053,8 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
spin_unlock_irq(&device->resource->req_lock);
atomic_add(pi->size >> 9, &device->rs_sect_ev);
- if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
+ if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, 0,
+ DRBD_FAULT_RS_WR) == 0)
return 0;
/* don't care for the reason here */
@@ -2065,13 +2289,13 @@ static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
struct drbd_peer_request *rs_req;
- bool rv = 0;
+ bool rv = false;
spin_lock_irq(&device->resource->req_lock);
list_for_each_entry(rs_req, &device->sync_ee, w.list) {
if (overlaps(peer_req->i.sector, peer_req->i.size,
rs_req->i.sector, rs_req->i.size)) {
- rv = 1;
+ rv = true;
break;
}
}
@@ -2152,12 +2376,19 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co
/* see also bio_flags_to_wire()
* DRBD_REQ_*, because we need to semantically map the flags to data packet
* flags and back. We may replicate to other kernel versions. */
-static unsigned long wire_flags_to_bio(u32 dpf)
+static unsigned long wire_flags_to_bio_flags(u32 dpf)
{
return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
(dpf & DP_FUA ? REQ_FUA : 0) |
- (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
- (dpf & DP_DISCARD ? REQ_DISCARD : 0);
+ (dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
+}
+
+static unsigned long wire_flags_to_bio_op(u32 dpf)
+{
+ if (dpf & DP_DISCARD)
+ return REQ_OP_DISCARD;
+ else
+ return REQ_OP_WRITE;
}
static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
@@ -2303,7 +2534,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
struct drbd_peer_request *peer_req;
struct p_data *p = pi->data;
u32 peer_seq = be32_to_cpu(p->seq_num);
- int rw = WRITE;
+ int op, op_flags;
u32 dp_flags;
int err, tp;
@@ -2342,14 +2573,11 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
peer_req->flags |= EE_APPLICATION;
dp_flags = be32_to_cpu(p->dp_flags);
- rw |= wire_flags_to_bio(dp_flags);
+ op = wire_flags_to_bio_op(dp_flags);
+ op_flags = wire_flags_to_bio_flags(dp_flags);
if (pi->cmd == P_TRIM) {
- struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
- peer_req->flags |= EE_IS_TRIM;
- if (!blk_queue_discard(q))
- peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
D_ASSERT(peer_device, peer_req->i.size > 0);
- D_ASSERT(peer_device, rw & REQ_DISCARD);
+ D_ASSERT(peer_device, op == REQ_OP_DISCARD);
D_ASSERT(peer_device, peer_req->pages == NULL);
} else if (peer_req->pages == NULL) {
D_ASSERT(device, peer_req->i.size == 0);
@@ -2414,11 +2642,11 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
update_peer_seq(peer_device, peer_seq);
spin_lock_irq(&device->resource->req_lock);
}
- /* if we use the zeroout fallback code, we process synchronously
- * and we wait for all pending requests, respectively wait for
+ /* TRIM and WRITE_SAME are processed synchronously,
+ * we wait for all pending requests, respectively wait for
* active_ee to become empty in drbd_submit_peer_request();
* better not add ourselves here. */
- if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
+ if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0)
list_add_tail(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
@@ -2433,7 +2661,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
}
- err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
+ err = drbd_submit_peer_request(device, peer_req, op, op_flags,
+ DRBD_FAULT_DT_WR);
if (!err)
return 0;
@@ -2449,7 +2678,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
}
out_interrupted:
- drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
+ drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
put_ldev(device);
drbd_free_peer_req(device, peer_req);
return err;
@@ -2574,6 +2803,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
case P_DATA_REQUEST:
drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
break;
+ case P_RS_THIN_REQ:
case P_RS_DATA_REQUEST:
case P_CSUM_RS_REQUEST:
case P_OV_REQUEST:
@@ -2599,7 +2829,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
- true /* has real payload */, GFP_NOIO);
+ size, GFP_NOIO);
if (!peer_req) {
put_ldev(device);
return -ENOMEM;
@@ -2613,6 +2843,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
peer_req->flags |= EE_APPLICATION;
goto submit;
+ case P_RS_THIN_REQ:
+ /* If at some point in the future we have a smart way to
+ find out if this data block is completely deallocated,
+ then we would do something smarter here than reading
+ the block... */
+ peer_req->flags |= EE_RS_THIN_REQ;
case P_RS_DATA_REQUEST:
peer_req->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
@@ -2723,7 +2959,8 @@ submit_for_resync:
submit:
update_receiver_timing_details(connection, drbd_submit_peer_request);
inc_unacked(device);
- if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
+ if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0,
+ fault_type) == 0)
return 0;
/* don't care for the reason here */
@@ -2957,7 +3194,8 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
-1091 requires proto 91
-1096 requires proto 96
*/
-static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local)
+
+static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
{
struct drbd_peer_device *const peer_device = first_peer_device(device);
struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
@@ -3037,8 +3275,39 @@ static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __m
* next bit (weight 2) is set when peer was primary */
*rule_nr = 40;
+ /* Neither has the "crashed primary" flag set,
+ * only a replication link hickup. */
+ if (rct == 0)
+ return 0;
+
+ /* Current UUID equal and no bitmap uuid; does not necessarily
+ * mean this was a "simultaneous hard crash", maybe IO was
+ * frozen, so no UUID-bump happened.
+ * This is a protocol change, overload DRBD_FF_WSAME as flag
+ * for "new-enough" peer DRBD version. */
+ if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
+ *rule_nr = 41;
+ if (!(connection->agreed_features & DRBD_FF_WSAME)) {
+ drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
+ return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
+ }
+ if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
+ /* At least one has the "crashed primary" bit set,
+ * both are primary now, but neither has rotated its UUIDs?
+ * "Can not happen." */
+ drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
+ return -100;
+ }
+ if (device->state.role == R_PRIMARY)
+ return 1;
+ return -1;
+ }
+
+ /* Both are secondary.
+ * Really looks like recovery from simultaneous hard crash.
+ * Check which had been primary before, and arbitrate. */
switch (rct) {
- case 0: /* !self_pri && !peer_pri */ return 0;
+ case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
case 1: /* self_pri && !peer_pri */ return 1;
case 2: /* !self_pri && peer_pri */ return -1;
case 3: /* self_pri && peer_pri */
@@ -3165,7 +3434,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
drbd_uuid_dump(device, "peer", device->p_uuid,
device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
- hg = drbd_uuid_compare(device, &rule_nr);
+ hg = drbd_uuid_compare(device, peer_role, &rule_nr);
spin_unlock_irq(&device->ldev->md.uuid_lock);
drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
@@ -3174,6 +3443,15 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
drbd_alert(device, "Unrelated data, aborting!\n");
return C_MASK;
}
+ if (hg < -0x10000) {
+ int proto, fflags;
+ hg = -hg;
+ proto = hg & 0xff;
+ fflags = (hg >> 8) & 0xff;
+ drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
+ proto, fflags);
+ return C_MASK;
+ }
if (hg < -1000) {
drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
return C_MASK;
@@ -3403,7 +3681,8 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
*/
peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
- if (!peer_integrity_tfm) {
+ if (IS_ERR(peer_integrity_tfm)) {
+ peer_integrity_tfm = NULL;
drbd_err(connection, "peer data-integrity-alg %s not supported\n",
integrity_alg);
goto disconnect;
@@ -3754,6 +4033,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
struct drbd_peer_device *peer_device;
struct drbd_device *device;
struct p_sizes *p = pi->data;
+ struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
enum determine_dev_size dd = DS_UNCHANGED;
sector_t p_size, p_usize, p_csize, my_usize;
int ldsc = 0; /* local disk size changed */
@@ -3773,6 +4053,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
device->p_size = p_size;
if (get_ldev(device)) {
+ sector_t new_size, cur_size;
rcu_read_lock();
my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
rcu_read_unlock();
@@ -3789,11 +4070,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
/* Never shrink a device with usable data during connect.
But allow online shrinking if we are connected. */
- if (drbd_new_dev_size(device, device->ldev, p_usize, 0) <
- drbd_get_capacity(device->this_bdev) &&
+ new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
+ cur_size = drbd_get_capacity(device->this_bdev);
+ if (new_size < cur_size &&
device->state.disk >= D_OUTDATED &&
device->state.conn < C_CONNECTED) {
- drbd_err(device, "The peer's disk size is too small!\n");
+ drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
+ (unsigned long long)new_size, (unsigned long long)cur_size);
conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
put_ldev(device);
return -EIO;
@@ -3827,14 +4110,14 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
}
device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
- /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
+ /* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
In case we cleared the QUEUE_FLAG_DISCARD from our queue in
- drbd_reconsider_max_bio_size(), we can be sure that after
+ drbd_reconsider_queue_parameters(), we can be sure that after
drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
ddsf = be16_to_cpu(p->dds_flags);
if (get_ldev(device)) {
- drbd_reconsider_max_bio_size(device, device->ldev);
+ drbd_reconsider_queue_parameters(device, device->ldev, o);
dd = drbd_determine_dev_size(device, ddsf, NULL);
put_ldev(device);
if (dd == DS_ERROR)
@@ -3854,7 +4137,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
* However, if he sends a zero current size,
* take his (user-capped or) backing disk size anyways.
*/
- drbd_reconsider_max_bio_size(device, NULL);
+ drbd_reconsider_queue_parameters(device, NULL, o);
drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
}
@@ -4587,9 +4870,75 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet
return 0;
}
+static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct p_block_desc *p = pi->data;
+ struct drbd_device *device;
+ sector_t sector;
+ int size, err = 0;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ sector = be64_to_cpu(p->sector);
+ size = be32_to_cpu(p->blksize);
+
+ dec_rs_pending(device);
+
+ if (get_ldev(device)) {
+ struct drbd_peer_request *peer_req;
+ const int op = REQ_OP_DISCARD;
+
+ peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
+ size, 0, GFP_NOIO);
+ if (!peer_req) {
+ put_ldev(device);
+ return -ENOMEM;
+ }
+
+ peer_req->w.cb = e_end_resync_block;
+ peer_req->submit_jif = jiffies;
+ peer_req->flags |= EE_IS_TRIM;
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->sync_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ atomic_add(pi->size >> 9, &device->rs_sect_ev);
+ err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR);
+
+ if (err) {
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ drbd_free_peer_req(device, peer_req);
+ put_ldev(device);
+ err = 0;
+ goto fail;
+ }
+
+ inc_unacked(device);
+
+ /* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
+ as well as drbd_rs_complete_io() */
+ } else {
+ fail:
+ drbd_rs_complete_io(device, sector);
+ drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
+ }
+
+ atomic_add(size >> 9, &device->rs_sect_in);
+
+ return err;
+}
+
struct data_cmd {
int expect_payload;
- size_t pkt_size;
+ unsigned int pkt_size;
int (*fn)(struct drbd_connection *, struct packet_info *);
};
@@ -4614,11 +4963,14 @@ static struct data_cmd drbd_cmd_handler[] = {
[P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+ [P_RS_THIN_REQ] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
[P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
[P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
+ [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
+ [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data },
};
static void drbdd(struct drbd_connection *connection)
@@ -4628,7 +4980,7 @@ static void drbdd(struct drbd_connection *connection)
int err;
while (get_t_state(&connection->receiver) == RUNNING) {
- struct data_cmd *cmd;
+ struct data_cmd const *cmd;
drbd_thread_current_set_cpu(&connection->receiver);
update_receiver_timing_details(connection, drbd_recv_header);
@@ -4643,11 +4995,18 @@ static void drbdd(struct drbd_connection *connection)
}
shs = cmd->pkt_size;
+ if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
+ shs += sizeof(struct o_qlim);
if (pi.size > shs && !cmd->expect_payload) {
drbd_err(connection, "No payload expected %s l:%d\n",
cmdname(pi.cmd), pi.size);
goto err_out;
}
+ if (pi.size < shs) {
+ drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
+ cmdname(pi.cmd), (int)shs, pi.size);
+ goto err_out;
+ }
if (shs) {
update_receiver_timing_details(connection, drbd_recv_all_warn);
@@ -4783,9 +5142,11 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device)
drbd_md_sync(device);
- /* serialize with bitmap writeout triggered by the state change,
- * if any. */
- wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+ if (get_ldev(device)) {
+ drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
+ "write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
+ put_ldev(device);
+ }
/* tcp_close and release of sendpage pages can be deferred. I don't
* want to use SO_LINGER, because apparently it can be deferred for
@@ -4892,8 +5253,12 @@ static int drbd_do_features(struct drbd_connection *connection)
drbd_info(connection, "Handshake successful: "
"Agreed network protocol version %d\n", connection->agreed_pro_version);
- drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
- connection->agreed_features & FF_TRIM ? " " : " not ");
+ drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n",
+ connection->agreed_features,
+ connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
+ connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
+ connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" :
+ connection->agreed_features ? "" : " none");
return 1;
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 2255dcfebd2b..66b8e4bb74d8 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -47,8 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r
&device->vdisk->part0, req->start_jif);
}
-static struct drbd_request *drbd_req_new(struct drbd_device *device,
- struct bio *bio_src)
+static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src)
{
struct drbd_request *req;
@@ -58,10 +57,12 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
memset(req, 0, sizeof(*req));
drbd_req_make_private_bio(req, bio_src);
- req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
- req->device = device;
- req->master_bio = bio_src;
- req->epoch = 0;
+ req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
+ | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
+ | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
+ req->device = device;
+ req->master_bio = bio_src;
+ req->epoch = 0;
drbd_clear_interval(&req->i);
req->i.sector = bio_src->bi_iter.bi_sector;
@@ -218,7 +219,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
{
const unsigned s = req->rq_state;
struct drbd_device *device = req->device;
- int rw;
int error, ok;
/* we must not complete the master bio, while it is
@@ -242,8 +242,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
return;
}
- rw = bio_rw(req->master_bio);
-
/*
* figure out whether to report success or failure.
*
@@ -267,7 +265,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
* epoch number. If they match, increase the current_tle_nr,
* and reset the transfer log epoch write_cnt.
*/
- if (rw == WRITE &&
+ if (op_is_write(bio_op(req->master_bio)) &&
req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr))
start_new_tl_epoch(first_peer_device(device)->connection);
@@ -284,11 +282,14 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
* because no path was available, in which case
* it was not even added to the transfer_log.
*
- * READA may fail, and will not be retried.
+ * read-ahead may fail, and will not be retried.
*
* WRITE should have used all available paths already.
*/
- if (!ok && rw == READ && !list_empty(&req->tl_requests))
+ if (!ok &&
+ bio_op(req->master_bio) == REQ_OP_READ &&
+ !(req->master_bio->bi_rw & REQ_RAHEAD) &&
+ !list_empty(&req->tl_requests))
req->rq_state |= RQ_POSTPONED;
if (!(req->rq_state & RQ_POSTPONED)) {
@@ -644,7 +645,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
__drbd_chk_io_error(device, DRBD_READ_ERROR);
/* fall through. */
case READ_AHEAD_COMPLETED_WITH_ERROR:
- /* it is legal to fail READA, no __drbd_chk_io_error in that case. */
+ /* it is legal to fail read-ahead, no __drbd_chk_io_error in that case. */
mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
break;
@@ -656,7 +657,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
break;
case QUEUE_FOR_NET_READ:
- /* READ or READA, and
+ /* READ, and
* no local disk,
* or target area marked as invalid,
* or just got an io-error. */
@@ -977,16 +978,20 @@ static void complete_conflicting_writes(struct drbd_request *req)
sector_t sector = req->i.sector;
int size = req->i.size;
- i = drbd_find_overlap(&device->write_requests, sector, size);
- if (!i)
- return;
-
for (;;) {
- prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
- i = drbd_find_overlap(&device->write_requests, sector, size);
- if (!i)
+ drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+ /* Ignore, if already completed to upper layers. */
+ if (i->completed)
+ continue;
+ /* Handle the first found overlap. After the schedule
+ * we have to restart the tree walk. */
+ break;
+ }
+ if (!i) /* if any */
break;
+
/* Indicate to wake up device->misc_wait on progress. */
+ prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
i->waiting = true;
spin_unlock_irq(&device->resource->req_lock);
schedule();
@@ -995,7 +1000,7 @@ static void complete_conflicting_writes(struct drbd_request *req)
finish_wait(&device->misc_wait, &wait);
}
-/* called within req_lock and rcu_read_lock() */
+/* called within req_lock */
static void maybe_pull_ahead(struct drbd_device *device)
{
struct drbd_connection *connection = first_peer_device(device)->connection;
@@ -1132,7 +1137,7 @@ static int drbd_process_write_request(struct drbd_request *req)
* replicating, in which case there is no point. */
if (unlikely(req->i.size == 0)) {
/* The only size==0 bios we expect are empty flushes. */
- D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH);
+ D_ASSERT(device, req->master_bio->bi_rw & REQ_PREFLUSH);
if (remote)
_req_mod(req, QUEUE_AS_DRBD_BARRIER);
return remote;
@@ -1152,12 +1157,29 @@ static int drbd_process_write_request(struct drbd_request *req)
return remote;
}
+static void drbd_process_discard_req(struct drbd_request *req)
+{
+ int err = drbd_issue_discard_or_zero_out(req->device,
+ req->i.sector, req->i.size >> 9, true);
+
+ if (err)
+ req->private_bio->bi_error = -EIO;
+ bio_endio(req->private_bio);
+}
+
static void
drbd_submit_req_private_bio(struct drbd_request *req)
{
struct drbd_device *device = req->device;
struct bio *bio = req->private_bio;
- const int rw = bio_rw(bio);
+ unsigned int type;
+
+ if (bio_op(bio) != REQ_OP_READ)
+ type = DRBD_FAULT_DT_WR;
+ else if (bio->bi_rw & REQ_RAHEAD)
+ type = DRBD_FAULT_DT_RA;
+ else
+ type = DRBD_FAULT_DT_RD;
bio->bi_bdev = device->ldev->backing_bdev;
@@ -1167,11 +1189,10 @@ drbd_submit_req_private_bio(struct drbd_request *req)
* stable storage, and this is a WRITE, we may not even submit
* this bio. */
if (get_ldev(device)) {
- if (drbd_insert_fault(device,
- rw == WRITE ? DRBD_FAULT_DT_WR
- : rw == READ ? DRBD_FAULT_DT_RD
- : DRBD_FAULT_DT_RA))
+ if (drbd_insert_fault(device, type))
bio_io_error(bio);
+ else if (bio_op(bio) == REQ_OP_DISCARD)
+ drbd_process_discard_req(req);
else
generic_make_request(bio);
put_ldev(device);
@@ -1223,24 +1244,45 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
/* Update disk stats */
_drbd_start_io_acct(device, req);
+ /* process discards always from our submitter thread */
+ if (bio_op(bio) & REQ_OP_DISCARD)
+ goto queue_for_submitter_thread;
+
if (rw == WRITE && req->private_bio && req->i.size
&& !test_bit(AL_SUSPENDED, &device->flags)) {
- if (!drbd_al_begin_io_fastpath(device, &req->i)) {
- atomic_inc(&device->ap_actlog_cnt);
- drbd_queue_write(device, req);
- return NULL;
- }
+ if (!drbd_al_begin_io_fastpath(device, &req->i))
+ goto queue_for_submitter_thread;
req->rq_state |= RQ_IN_ACT_LOG;
req->in_actlog_jif = jiffies;
}
-
return req;
+
+ queue_for_submitter_thread:
+ atomic_inc(&device->ap_actlog_cnt);
+ drbd_queue_write(device, req);
+ return NULL;
+}
+
+/* Require at least one path to current data.
+ * We don't want to allow writes on C_STANDALONE D_INCONSISTENT:
+ * We would not allow to read what was written,
+ * we would not have bumped the data generation uuids,
+ * we would cause data divergence for all the wrong reasons.
+ *
+ * If we don't see at least one D_UP_TO_DATE, we will fail this request,
+ * which either returns EIO, or, if OND_SUSPEND_IO is set, suspends IO,
+ * and queues for retry later.
+ */
+static bool may_do_writes(struct drbd_device *device)
+{
+ const union drbd_dev_state s = device->state;
+ return s.disk == D_UP_TO_DATE || s.pdsk == D_UP_TO_DATE;
}
static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
{
struct drbd_resource *resource = device->resource;
- const int rw = bio_rw(req->master_bio);
+ const int rw = bio_data_dir(req->master_bio);
struct bio_and_error m = { NULL, };
bool no_remote = false;
bool submit_private_bio = false;
@@ -1270,7 +1312,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
goto out;
}
- /* We fail READ/READA early, if we can not serve it.
+ /* We fail READ early, if we can not serve it.
* We must do this before req is registered on any lists.
* Otherwise, drbd_req_complete() will queue failed READ for retry. */
if (rw != WRITE) {
@@ -1291,6 +1333,12 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
}
if (rw == WRITE) {
+ if (req->private_bio && !may_do_writes(device)) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(device);
+ goto nodata;
+ }
if (!drbd_process_write_request(req))
no_remote = true;
} else {
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index bb2ef78165e5..eb49e7f2da91 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -206,6 +206,8 @@ enum drbd_req_state_bits {
/* Set when this is a write, clear for a read */
__RQ_WRITE,
+ __RQ_WSAME,
+ __RQ_UNMAP,
/* Should call drbd_al_complete_io() for this request... */
__RQ_IN_ACT_LOG,
@@ -241,10 +243,11 @@ enum drbd_req_state_bits {
#define RQ_NET_OK (1UL << __RQ_NET_OK)
#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
-/* 0x1f8 */
#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
#define RQ_WRITE (1UL << __RQ_WRITE)
+#define RQ_WSAME (1UL << __RQ_WSAME)
+#define RQ_UNMAP (1UL << __RQ_UNMAP)
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 5a7ef7873b67..eea0c4aec978 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -814,7 +814,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns)
}
if (rv <= 0)
- /* already found a reason to abort */;
+ goto out; /* already found a reason to abort */
else if (ns.role == R_SECONDARY && device->open_cnt)
rv = SS_DEVICE_IN_USE;
@@ -862,6 +862,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns)
else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
rv = SS_CONNECTED_OUTDATES;
+out:
rcu_read_unlock();
return rv;
@@ -906,6 +907,15 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_c
(ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS)))
rv = SS_IN_TRANSIENT_STATE;
+ /* Do not promote during resync handshake triggered by "force primary".
+ * This is a hack. It should really be rejected by the peer during the
+ * cluster wide state change request. */
+ if (os.role != R_PRIMARY && ns.role == R_PRIMARY
+ && ns.pdsk == D_UP_TO_DATE
+ && ns.disk != D_UP_TO_DATE && ns.disk != D_DISKLESS
+ && (ns.conn <= C_WF_SYNC_UUID || ns.conn != os.conn))
+ rv = SS_IN_TRANSIENT_STATE;
+
if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
rv = SS_NEED_CONNECTION;
@@ -1628,6 +1638,26 @@ static void broadcast_state_change(struct drbd_state_change *state_change)
#undef REMEMBER_STATE_CHANGE
}
+/* takes old and new peer disk state */
+static bool lost_contact_to_peer_data(enum drbd_disk_state os, enum drbd_disk_state ns)
+{
+ if ((os >= D_INCONSISTENT && os != D_UNKNOWN && os != D_OUTDATED)
+ && (ns < D_INCONSISTENT || ns == D_UNKNOWN || ns == D_OUTDATED))
+ return true;
+
+ /* Scenario, starting with normal operation
+ * Connected Primary/Secondary UpToDate/UpToDate
+ * NetworkFailure Primary/Unknown UpToDate/DUnknown (frozen)
+ * ...
+ * Connected Primary/Secondary UpToDate/Diskless (resumed; needs to bump uuid!)
+ */
+ if (os == D_UNKNOWN
+ && (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED))
+ return true;
+
+ return false;
+}
+
/**
* after_state_ch() - Perform after state change actions that may sleep
* @device: DRBD device.
@@ -1675,7 +1705,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
what = RESEND;
if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
- conn_lowest_disk(connection) > D_NEGOTIATING)
+ conn_lowest_disk(connection) == D_UP_TO_DATE)
what = RESTART_FROZEN_DISK_IO;
if (resource->susp_nod && what != NOTHING) {
@@ -1699,6 +1729,13 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
clear_bit(NEW_CUR_UUID, &peer_device->device->flags);
rcu_read_unlock();
+
+ /* We should actively create a new uuid, _before_
+ * we resume/resent, if the peer is diskless
+ * (recovery from a multiple error scenario).
+ * Currently, this happens with a slight delay
+ * below when checking lost_contact_to_peer_data() ...
+ */
_tl_restart(connection, RESEND);
_conn_request_state(connection,
(union drbd_state) { { .susp_fen = 1 } },
@@ -1742,12 +1779,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
BM_LOCKED_TEST_ALLOWED);
/* Lost contact to peer's copy of the data */
- if ((os.pdsk >= D_INCONSISTENT &&
- os.pdsk != D_UNKNOWN &&
- os.pdsk != D_OUTDATED)
- && (ns.pdsk < D_INCONSISTENT ||
- ns.pdsk == D_UNKNOWN ||
- ns.pdsk == D_OUTDATED)) {
+ if (lost_contact_to_peer_data(os.pdsk, ns.pdsk)) {
if (get_ldev(device)) {
if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
@@ -1934,12 +1966,17 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
/* This triggers bitmap writeout of potentially still unwritten pages
* if the resync finished cleanly, or aborted because of peer disk
- * failure, or because of connection loss.
+ * failure, or on transition from resync back to AHEAD/BEHIND.
+ *
+ * Connection loss is handled in drbd_disconnected() by the receiver.
+ *
* For resync aborted because of local disk failure, we cannot do
* any bitmap writeout anymore.
+ *
* No harm done if some bits change during this phase.
*/
- if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) {
+ if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) &&
+ (ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) {
drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
put_ldev(device);
@@ -2160,9 +2197,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union
ns.disk = os.disk;
rv = _drbd_set_state(device, ns, flags, NULL);
- if (rv < SS_SUCCESS)
- BUG();
-
+ BUG_ON(rv < SS_SUCCESS);
ns.i = device->state.i;
ns_max.role = max_role(ns.role, ns_max.role);
ns_max.peer = max_role(ns.peer, ns_max.peer);
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
index bd989536f888..6c9d5d4a8a75 100644
--- a/drivers/block/drbd/drbd_state.h
+++ b/drivers/block/drbd/drbd_state.h
@@ -140,7 +140,7 @@ extern void drbd_resume_al(struct drbd_device *device);
extern bool conn_all_vols_unconf(struct drbd_connection *connection);
/**
- * drbd_request_state() - Reqest a state change
+ * drbd_request_state() - Request a state change
* @device: DRBD device.
* @mask: mask of state bits to change.
* @val: value of new state bits.
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index 80b0f63c7075..0eeab14776e9 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -26,7 +26,7 @@
#include <linux/drbd.h>
#include "drbd_strings.h"
-static const char *drbd_conn_s_names[] = {
+static const char * const drbd_conn_s_names[] = {
[C_STANDALONE] = "StandAlone",
[C_DISCONNECTING] = "Disconnecting",
[C_UNCONNECTED] = "Unconnected",
@@ -53,13 +53,13 @@ static const char *drbd_conn_s_names[] = {
[C_BEHIND] = "Behind",
};
-static const char *drbd_role_s_names[] = {
+static const char * const drbd_role_s_names[] = {
[R_PRIMARY] = "Primary",
[R_SECONDARY] = "Secondary",
[R_UNKNOWN] = "Unknown"
};
-static const char *drbd_disk_s_names[] = {
+static const char * const drbd_disk_s_names[] = {
[D_DISKLESS] = "Diskless",
[D_ATTACHING] = "Attaching",
[D_FAILED] = "Failed",
@@ -71,7 +71,7 @@ static const char *drbd_disk_s_names[] = {
[D_UP_TO_DATE] = "UpToDate",
};
-static const char *drbd_state_sw_errors[] = {
+static const char * const drbd_state_sw_errors[] = {
[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
[-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 4d87499f0d54..35dbb3dca47e 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -173,8 +173,8 @@ void drbd_peer_request_endio(struct bio *bio)
{
struct drbd_peer_request *peer_req = bio->bi_private;
struct drbd_device *device = peer_req->peer_device->device;
- int is_write = bio_data_dir(bio) == WRITE;
- int is_discard = !!(bio->bi_rw & REQ_DISCARD);
+ bool is_write = bio_data_dir(bio) == WRITE;
+ bool is_discard = !!(bio_op(bio) == REQ_OP_DISCARD);
if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
drbd_warn(device, "%s: error=%d s=%llus\n",
@@ -248,18 +248,26 @@ void drbd_request_endio(struct bio *bio)
/* to avoid recursion in __req_mod */
if (unlikely(bio->bi_error)) {
- if (bio->bi_rw & REQ_DISCARD)
- what = (bio->bi_error == -EOPNOTSUPP)
- ? DISCARD_COMPLETED_NOTSUPP
- : DISCARD_COMPLETED_WITH_ERROR;
- else
- what = (bio_data_dir(bio) == WRITE)
- ? WRITE_COMPLETED_WITH_ERROR
- : (bio_rw(bio) == READ)
- ? READ_COMPLETED_WITH_ERROR
- : READ_AHEAD_COMPLETED_WITH_ERROR;
- } else
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ if (bio->bi_error == -EOPNOTSUPP)
+ what = DISCARD_COMPLETED_NOTSUPP;
+ else
+ what = DISCARD_COMPLETED_WITH_ERROR;
+ break;
+ case REQ_OP_READ:
+ if (bio->bi_rw & REQ_RAHEAD)
+ what = READ_AHEAD_COMPLETED_WITH_ERROR;
+ else
+ what = READ_COMPLETED_WITH_ERROR;
+ break;
+ default:
+ what = WRITE_COMPLETED_WITH_ERROR;
+ break;
+ }
+ } else {
what = COMPLETED_OK;
+ }
bio_put(req->private_bio);
req->private_bio = ERR_PTR(bio->bi_error);
@@ -320,6 +328,10 @@ void drbd_csum_bio(struct crypto_ahash *tfm, struct bio *bio, void *digest)
sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
ahash_request_set_crypt(req, &sg, NULL, sg.length);
crypto_ahash_update(req);
+ /* REQ_OP_WRITE_SAME has only one segment,
+ * checksum the payload only once. */
+ if (bio_op(bio) == REQ_OP_WRITE_SAME)
+ break;
}
ahash_request_set_crypt(req, NULL, digest, 0);
crypto_ahash_final(req);
@@ -387,7 +399,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
- size, true /* has real payload */, GFP_TRY);
+ size, size, GFP_TRY);
if (!peer_req)
goto defer;
@@ -397,7 +409,8 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
spin_unlock_irq(&device->resource->req_lock);
atomic_add(size >> 9, &device->rs_sect_ev);
- if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
+ if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0,
+ DRBD_FAULT_RS_RD) == 0)
return 0;
/* If it failed because of ENOMEM, retry should help. If it failed
@@ -582,6 +595,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
int number, rollback_i, size;
int align, requeue = 0;
int i = 0;
+ int discard_granularity = 0;
if (unlikely(cancel))
return 0;
@@ -601,6 +615,12 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
return 0;
}
+ if (connection->agreed_features & DRBD_FF_THIN_RESYNC) {
+ rcu_read_lock();
+ discard_granularity = rcu_dereference(device->ldev->disk_conf)->rs_discard_granularity;
+ rcu_read_unlock();
+ }
+
max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
number = drbd_rs_number_requests(device);
if (number <= 0)
@@ -665,6 +685,9 @@ next_sector:
if (sector & ((1<<(align+3))-1))
break;
+ if (discard_granularity && size == discard_granularity)
+ break;
+
/* do not cross extent boundaries */
if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
break;
@@ -711,7 +734,8 @@ next_sector:
int err;
inc_rs_pending(device);
- err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
+ err = drbd_send_drequest(peer_device,
+ size == discard_granularity ? P_RS_THIN_REQ : P_RS_DATA_REQUEST,
sector, size, ID_SYNCER);
if (err) {
drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
@@ -828,6 +852,7 @@ static void ping_peer(struct drbd_device *device)
int drbd_resync_finished(struct drbd_device *device)
{
+ struct drbd_connection *connection = first_peer_device(device)->connection;
unsigned long db, dt, dbdt;
unsigned long n_oos;
union drbd_state os, ns;
@@ -849,8 +874,7 @@ int drbd_resync_finished(struct drbd_device *device)
if (dw) {
dw->w.cb = w_resync_finished;
dw->device = device;
- drbd_queue_work(&first_peer_device(device)->connection->sender_work,
- &dw->w);
+ drbd_queue_work(&connection->sender_work, &dw->w);
return 1;
}
drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
@@ -963,6 +987,30 @@ int drbd_resync_finished(struct drbd_device *device)
_drbd_set_state(device, ns, CS_VERBOSE, NULL);
out_unlock:
spin_unlock_irq(&device->resource->req_lock);
+
+ /* If we have been sync source, and have an effective fencing-policy,
+ * once *all* volumes are back in sync, call "unfence". */
+ if (os.conn == C_SYNC_SOURCE) {
+ enum drbd_disk_state disk_state = D_MASK;
+ enum drbd_disk_state pdsk_state = D_MASK;
+ enum drbd_fencing_p fp = FP_DONT_CARE;
+
+ rcu_read_lock();
+ fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+ if (fp != FP_DONT_CARE) {
+ struct drbd_peer_device *peer_device;
+ int vnr;
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk);
+ pdsk_state = min_t(enum drbd_disk_state, pdsk_state, device->state.pdsk);
+ }
+ }
+ rcu_read_unlock();
+ if (disk_state == D_UP_TO_DATE && pdsk_state == D_UP_TO_DATE)
+ conn_khelper(connection, "unfence-peer");
+ }
+
put_ldev(device);
out:
device->rs_total = 0;
@@ -999,7 +1047,6 @@ static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_
/**
* w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
- * @device: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
@@ -1035,6 +1082,30 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
return err;
}
+static bool all_zero(struct drbd_peer_request *peer_req)
+{
+ struct page *page = peer_req->pages;
+ unsigned int len = peer_req->i.size;
+
+ page_chain_for_each(page) {
+ unsigned int l = min_t(unsigned int, len, PAGE_SIZE);
+ unsigned int i, words = l / sizeof(long);
+ unsigned long *d;
+
+ d = kmap_atomic(page);
+ for (i = 0; i < words; i++) {
+ if (d[i]) {
+ kunmap_atomic(d);
+ return false;
+ }
+ }
+ kunmap_atomic(d);
+ len -= l;
+ }
+
+ return true;
+}
+
/**
* w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
* @w: work object.
@@ -1063,7 +1134,10 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
if (likely(device->state.pdsk >= D_INCONSISTENT)) {
inc_rs_pending(device);
- err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+ if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req))
+ err = drbd_send_rs_deallocated(peer_device, peer_req);
+ else
+ err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
drbd_err(device, "Not sending RSDataReply, "
@@ -1633,7 +1707,7 @@ static bool use_checksum_based_resync(struct drbd_connection *connection, struct
rcu_read_unlock();
return connection->agreed_pro_version >= 89 && /* supported? */
connection->csums_tfm && /* configured? */
- (csums_after_crash_only == 0 /* use for each resync? */
+ (csums_after_crash_only == false /* use for each resync? */
|| test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */
}
@@ -1768,7 +1842,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
device->bm_resync_fo = 0;
device->use_csums = use_checksum_based_resync(connection, device);
} else {
- device->use_csums = 0;
+ device->use_csums = false;
}
/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 84708a5f8c52..c557057fe8ae 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3822,8 +3822,9 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive)
bio.bi_flags |= (1 << BIO_QUIET);
bio.bi_private = &cbdata;
bio.bi_end_io = floppy_rb0_cb;
+ bio_set_op_attrs(&bio, REQ_OP_READ, 0);
- submit_bio(READ, &bio);
+ submit_bio(&bio);
process_fd_request();
init_completion(&cbdata.complete);
@@ -4349,8 +4350,7 @@ static int __init do_floppy_init(void)
/* to be cleaned up... */
disks[drive]->private_data = (void *)(long)drive;
disks[drive]->flags |= GENHD_FL_REMOVABLE;
- disks[drive]->driverfs_dev = &floppy_device[drive].dev;
- add_disk(disks[drive]);
+ device_add_disk(&floppy_device[drive].dev, disks[drive]);
}
return 0;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 1fa8cc235977..075377eee0c0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -447,7 +447,7 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
{
- if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE))
+ if (bytes < 0 || op_is_write(req_op(cmd->rq)))
return;
if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
@@ -541,10 +541,10 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
- if (rq->cmd_flags & REQ_WRITE) {
- if (rq->cmd_flags & REQ_FLUSH)
+ if (op_is_write(req_op(rq))) {
+ if (req_op(rq) == REQ_OP_FLUSH)
ret = lo_req_flush(lo, rq);
- else if (rq->cmd_flags & REQ_DISCARD)
+ else if (req_op(rq) == REQ_OP_DISCARD)
ret = lo_discard(lo, rq, pos);
else if (lo->transfer)
ret = lo_write_transfer(lo, rq, pos);
@@ -1659,8 +1659,8 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
if (lo->lo_state != Lo_bound)
return -EIO;
- if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH |
- REQ_DISCARD)))
+ if (lo->use_dio && (req_op(cmd->rq) != REQ_OP_FLUSH ||
+ req_op(cmd->rq) == REQ_OP_DISCARD))
cmd->use_aio = true;
else
cmd->use_aio = false;
@@ -1672,7 +1672,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
static void loop_handle_cmd(struct loop_cmd *cmd)
{
- const bool write = cmd->rq->cmd_flags & REQ_WRITE;
+ const bool write = op_is_write(req_op(cmd->rq));
struct loop_device *lo = cmd->rq->q->queuedata;
int ret = 0;
@@ -1765,6 +1765,7 @@ static int loop_add(struct loop_device **l, int i)
*/
queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue);
+ err = -ENOMEM;
disk = lo->lo_disk = alloc_disk(1 << part_shift);
if (!disk)
goto out_free_queue;
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 145ce2aa2e78..e937fcf71769 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -687,15 +687,13 @@ static unsigned int mg_issue_req(struct request *req,
unsigned int sect_num,
unsigned int sect_cnt)
{
- switch (rq_data_dir(req)) {
- case READ:
+ if (rq_data_dir(req) == READ) {
if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
!= MG_ERR_NONE) {
mg_bad_rw_intr(host);
return host->error;
}
- break;
- case WRITE:
+ } else {
/* TODO : handler */
outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
@@ -714,7 +712,6 @@ static unsigned int mg_issue_req(struct request *req,
mod_timer(&host->timer, jiffies + 3 * HZ);
outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
MG_REG_COMMAND);
- break;
}
return MG_ERR_NONE;
}
@@ -1018,7 +1015,7 @@ probe_err_7:
probe_err_6:
blk_cleanup_queue(host->breq);
probe_err_5:
- unregister_blkdev(MG_DISK_MAJ, MG_DISK_NAME);
+ unregister_blkdev(host->major, MG_DISK_NAME);
probe_err_4:
if (!prv_data->use_polling)
free_irq(host->irq, host);
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 6053e4659fa2..2aca98e8e427 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3765,7 +3765,7 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
return -ENODATA;
}
- if (rq->cmd_flags & REQ_DISCARD) {
+ if (req_op(rq) == REQ_OP_DISCARD) {
int err;
err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
@@ -3956,7 +3956,6 @@ static int mtip_block_initialize(struct driver_data *dd)
if (rv)
goto disk_index_error;
- dd->disk->driverfs_dev = &dd->pdev->dev;
dd->disk->major = dd->major;
dd->disk->first_minor = index * MTIP_MAX_MINORS;
dd->disk->minors = MTIP_MAX_MINORS;
@@ -4008,7 +4007,7 @@ skip_create_disk:
/*
* if rebuild pending, start the service thread, and delay the block
- * queue creation and add_disk()
+ * queue creation and device_add_disk()
*/
if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
goto start_service_thread;
@@ -4042,7 +4041,7 @@ skip_create_disk:
set_capacity(dd->disk, capacity);
/* Enable the block device and add it to /dev */
- add_disk(dd->disk);
+ device_add_disk(&dd->pdev->dev, dd->disk);
dd->bdev = bdget_disk(dd->disk, 0);
/*
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6a48ed41963f..6f55b262b5ce 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -282,9 +282,9 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req)
if (req->cmd_type == REQ_TYPE_DRV_PRIV)
type = NBD_CMD_DISC;
- else if (req->cmd_flags & REQ_DISCARD)
+ else if (req_op(req) == REQ_OP_DISCARD)
type = NBD_CMD_TRIM;
- else if (req->cmd_flags & REQ_FLUSH)
+ else if (req_op(req) == REQ_OP_FLUSH)
type = NBD_CMD_FLUSH;
else if (rq_data_dir(req) == WRITE)
type = NBD_CMD_WRITE;
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index cab97593ba54..75a7f88d6717 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -448,7 +448,7 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
struct request *rq;
struct bio *bio = rqd->bio;
- rq = blk_mq_alloc_request(q, bio_rw(bio), 0);
+ rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0);
if (IS_ERR(rq))
return -ENOMEM;
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index c2854a2bfdb0..92900f5f0b47 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -321,7 +321,7 @@ static void osdblk_rq_fn(struct request_queue *q)
* driver-specific, etc.
*/
- do_flush = rq->cmd_flags & REQ_FLUSH;
+ do_flush = (req_op(rq) == REQ_OP_FLUSH);
do_write = (rq_data_dir(rq) == WRITE);
if (!do_flush) { /* osd_flush does not use a bio */
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index d06c62eccdf0..9393bc730acf 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1074,7 +1074,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
BUG();
atomic_inc(&pkt->io_wait);
- bio->bi_rw = READ;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
pkt_queue_bio(pd, bio);
frames_read++;
}
@@ -1336,7 +1336,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
/* Start the write request */
atomic_set(&pkt->io_wait, 1);
- pkt->w_bio->bi_rw = WRITE;
+ bio_set_op_attrs(pkt->w_bio, REQ_OP_WRITE, 0);
pkt_queue_bio(pd, pkt->w_bio);
}
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 4b7e405830d7..76f33c84ce3d 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -196,7 +196,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
while ((req = blk_fetch_request(q))) {
- if (req->cmd_flags & REQ_FLUSH) {
+ if (req_op(req) == REQ_OP_FLUSH) {
if (ps3disk_submit_flush_request(dev, req))
break;
} else if (req->cmd_type == REQ_TYPE_FS) {
@@ -256,7 +256,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
return IRQ_HANDLED;
}
- if (req->cmd_flags & REQ_FLUSH) {
+ if (req_op(req) == REQ_OP_FLUSH) {
read = 0;
op = "flush";
} else {
@@ -487,7 +487,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
gendisk->fops = &ps3disk_fops;
gendisk->queue = queue;
gendisk->private_data = dev;
- gendisk->driverfs_dev = &dev->sbd.core;
snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME,
devidx+'a');
priv->blocking_factor = dev->blk_size >> 9;
@@ -499,7 +498,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
gendisk->disk_name, priv->model, priv->raw_capacity >> 11,
get_capacity(gendisk) >> 11);
- add_disk(gendisk);
+ device_add_disk(&dev->sbd.core, gendisk);
return 0;
fail_cleanup_queue:
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 56847fcda086..456b4fe21559 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -773,14 +773,13 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
gendisk->fops = &ps3vram_fops;
gendisk->queue = queue;
gendisk->private_data = dev;
- gendisk->driverfs_dev = &dev->core;
strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name));
set_capacity(gendisk, priv->size >> 9);
dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n",
gendisk->disk_name, get_capacity(gendisk) >> 11);
- add_disk(gendisk);
+ device_add_disk(&dev->core, gendisk);
return 0;
fail_cleanup_queue:
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 81666a56415e..450662055d97 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3286,9 +3286,9 @@ static void rbd_queue_workfn(struct work_struct *work)
goto err;
}
- if (rq->cmd_flags & REQ_DISCARD)
+ if (req_op(rq) == REQ_OP_DISCARD)
op_type = OBJ_OP_DISCARD;
- else if (rq->cmd_flags & REQ_WRITE)
+ else if (req_op(rq) == REQ_OP_WRITE)
op_type = OBJ_OP_WRITE;
else
op_type = OBJ_OP_READ;
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index e1b8b7061d2f..f81d70b39d10 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -230,8 +230,7 @@ int rsxx_attach_dev(struct rsxx_cardinfo *card)
set_capacity(card->gendisk, card->size8 >> 9);
else
set_capacity(card->gendisk, 0);
- add_disk(card->gendisk);
-
+ device_add_disk(CARD_TO_DEV(card), card->gendisk);
card->bdev_attached = 1;
}
@@ -308,7 +307,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name),
"rsxx%d", card->disk_id);
- card->gendisk->driverfs_dev = &card->dev->dev;
card->gendisk->major = card->major;
card->gendisk->first_minor = 0;
card->gendisk->fops = &rsxx_fops;
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index cf8cd293abb5..5a20385f87d0 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -705,7 +705,7 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
dma_cnt[i] = 0;
}
- if (bio->bi_rw & REQ_DISCARD) {
+ if (bio_op(bio) == REQ_OP_DISCARD) {
bv_len = bio->bi_iter.bi_size;
while (bv_len > 0) {
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 910e065918af..3822eae102db 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -597,7 +597,7 @@ static void skd_request_fn(struct request_queue *q)
data_dir = rq_data_dir(req);
io_flags = req->cmd_flags;
- if (io_flags & REQ_FLUSH)
+ if (req_op(req) == REQ_OP_FLUSH)
flush++;
if (io_flags & REQ_FUA)
@@ -4690,10 +4690,10 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return -EIO;
}
-static int skd_bdev_attach(struct skd_device *skdev)
+static int skd_bdev_attach(struct device *parent, struct skd_device *skdev)
{
pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__);
- add_disk(skdev->disk);
+ device_add_disk(parent, skdev->disk);
return 0;
}
@@ -4812,8 +4812,6 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
pci_set_drvdata(pdev, skdev);
- skdev->disk->driverfs_dev = &pdev->dev;
-
for (i = 0; i < SKD_MAX_BARS; i++) {
skdev->mem_phys[i] = pci_resource_start(pdev, i);
skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
@@ -4851,7 +4849,7 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
(SKD_START_WAIT_SECONDS * HZ));
if (skdev->gendisk_on > 0) {
/* device came on-line after reset */
- skd_bdev_attach(skdev);
+ skd_bdev_attach(&pdev->dev, skdev);
rc = 0;
} else {
/* we timed out, something is wrong with the device,
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 4b911ed96ea3..cab157331c4e 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -804,7 +804,6 @@ static int probe_disk(struct vdc_port *port)
g->fops = &vdc_fops;
g->queue = q;
g->private_data = port;
- g->driverfs_dev = &port->vio.vdev->dev;
set_capacity(g, port->vdisk_size);
@@ -835,7 +834,7 @@ static int probe_disk(struct vdc_port *port)
port->vdisk_size, (port->vdisk_size >> (20 - 9)),
port->vio.ver.major, port->vio.ver.minor);
- add_disk(g);
+ device_add_disk(&port->vio.vdev->dev, g);
return 0;
}
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 7939b9f87441..d0a3e6d4515f 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -344,7 +344,6 @@ static int add_bio(struct cardinfo *card)
int offset;
struct bio *bio;
struct bio_vec vec;
- int rw;
bio = card->currentbio;
if (!bio && card->bio) {
@@ -359,7 +358,6 @@ static int add_bio(struct cardinfo *card)
if (!bio)
return 0;
- rw = bio_rw(bio);
if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE)
return 0;
@@ -369,7 +367,7 @@ static int add_bio(struct cardinfo *card)
vec.bv_page,
vec.bv_offset,
vec.bv_len,
- (rw == READ) ?
+ bio_op(bio) == REQ_OP_READ ?
PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE);
p = &card->mm_pages[card->Ready];
@@ -398,7 +396,7 @@ static int add_bio(struct cardinfo *card)
DMASCR_CHAIN_EN |
DMASCR_SEM_EN |
pci_cmds);
- if (rw == WRITE)
+ if (bio_op(bio) == REQ_OP_WRITE)
desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ);
desc->sem_control_bits = desc->control_bits;
@@ -462,7 +460,7 @@ static void process_page(unsigned long data)
le32_to_cpu(desc->local_addr)>>9,
le32_to_cpu(desc->transfer_size));
dump_dmastat(card, control);
- } else if ((bio->bi_rw & REQ_WRITE) &&
+ } else if (op_is_write(bio_op(bio)) &&
le32_to_cpu(desc->local_addr) >> 9 ==
card->init_size) {
card->init_size += le32_to_cpu(desc->transfer_size) >> 9;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 42758b52768c..1523e05c46fc 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -172,7 +172,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
vbr->req = req;
- if (req->cmd_flags & REQ_FLUSH) {
+ if (req_op(req) == REQ_OP_FLUSH) {
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH);
vbr->out_hdr.sector = 0;
vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
@@ -236,25 +236,22 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
static int virtblk_get_id(struct gendisk *disk, char *id_str)
{
struct virtio_blk *vblk = disk->private_data;
+ struct request_queue *q = vblk->disk->queue;
struct request *req;
- struct bio *bio;
int err;
- bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES,
- GFP_KERNEL);
- if (IS_ERR(bio))
- return PTR_ERR(bio);
-
- req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL);
- if (IS_ERR(req)) {
- bio_put(bio);
+ req = blk_get_request(q, READ, GFP_KERNEL);
+ if (IS_ERR(req))
return PTR_ERR(req);
- }
-
req->cmd_type = REQ_TYPE_DRV_PRIV;
+
+ err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
+ if (err)
+ goto out;
+
err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+out:
blk_put_request(req);
-
return err;
}
@@ -656,7 +653,6 @@ static int virtblk_probe(struct virtio_device *vdev)
vblk->disk->first_minor = index_to_minor(index);
vblk->disk->private_data = vblk;
vblk->disk->fops = &virtblk_fops;
- vblk->disk->driverfs_dev = &vdev->dev;
vblk->disk->flags |= GENHD_FL_EXT_DEVT;
vblk->index = index;
@@ -733,7 +729,7 @@ static int virtblk_probe(struct virtio_device *vdev)
virtio_device_ready(vdev);
- add_disk(vblk->disk);
+ device_add_disk(&vdev->dev, vblk->disk);
err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
if (err)
goto out_del_disk;
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 4809c1501d7e..4a80ee752597 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -501,7 +501,7 @@ static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
struct xen_vbd *vbd = &blkif->vbd;
int rc = -EACCES;
- if ((operation != READ) && vbd->readonly)
+ if ((operation != REQ_OP_READ) && vbd->readonly)
goto out;
if (likely(req->nr_sects)) {
@@ -1014,7 +1014,7 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring,
preq.sector_number = req->u.discard.sector_number;
preq.nr_sects = req->u.discard.nr_sectors;
- err = xen_vbd_translate(&preq, blkif, WRITE);
+ err = xen_vbd_translate(&preq, blkif, REQ_OP_WRITE);
if (err) {
pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
preq.sector_number,
@@ -1229,6 +1229,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
struct bio **biolist = pending_req->biolist;
int i, nbio = 0;
int operation;
+ int operation_flags = 0;
struct blk_plug plug;
bool drain = false;
struct grant_page **pages = pending_req->segments;
@@ -1247,17 +1248,19 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
switch (req_operation) {
case BLKIF_OP_READ:
ring->st_rd_req++;
- operation = READ;
+ operation = REQ_OP_READ;
break;
case BLKIF_OP_WRITE:
ring->st_wr_req++;
- operation = WRITE_ODIRECT;
+ operation = REQ_OP_WRITE;
+ operation_flags = WRITE_ODIRECT;
break;
case BLKIF_OP_WRITE_BARRIER:
drain = true;
case BLKIF_OP_FLUSH_DISKCACHE:
ring->st_f_req++;
- operation = WRITE_FLUSH;
+ operation = REQ_OP_WRITE;
+ operation_flags = WRITE_FLUSH;
break;
default:
operation = 0; /* make gcc happy */
@@ -1269,7 +1272,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
nseg = req->operation == BLKIF_OP_INDIRECT ?
req->u.indirect.nr_segments : req->u.rw.nr_segments;
- if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
+ if (unlikely(nseg == 0 && operation_flags != WRITE_FLUSH) ||
unlikely((req->operation != BLKIF_OP_INDIRECT) &&
(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
unlikely((req->operation == BLKIF_OP_INDIRECT) &&
@@ -1310,7 +1313,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
- operation == READ ? "read" : "write",
+ operation == REQ_OP_READ ? "read" : "write",
preq.sector_number,
preq.sector_number + preq.nr_sects,
ring->blkif->vbd.pdevice);
@@ -1369,6 +1372,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
bio->bi_private = pending_req;
bio->bi_end_io = end_block_io_op;
bio->bi_iter.bi_sector = preq.sector_number;
+ bio_set_op_attrs(bio, operation, operation_flags);
}
preq.sector_number += seg[i].nsec;
@@ -1376,7 +1380,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
/* This will be hit if the operation was a flush or discard. */
if (!bio) {
- BUG_ON(operation != WRITE_FLUSH);
+ BUG_ON(operation_flags != WRITE_FLUSH);
bio = bio_alloc(GFP_KERNEL, 0);
if (unlikely(bio == NULL))
@@ -1386,20 +1390,21 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
bio->bi_bdev = preq.bdev;
bio->bi_private = pending_req;
bio->bi_end_io = end_block_io_op;
+ bio_set_op_attrs(bio, operation, operation_flags);
}
atomic_set(&pending_req->pendcnt, nbio);
blk_start_plug(&plug);
for (i = 0; i < nbio; i++)
- submit_bio(operation, biolist[i]);
+ submit_bio(biolist[i]);
/* Let the I/Os go.. */
blk_finish_plug(&plug);
- if (operation == READ)
+ if (operation == REQ_OP_READ)
ring->st_rd_sect += preq.nr_sects;
- else if (operation & WRITE)
+ else if (operation == REQ_OP_WRITE)
ring->st_wr_sect += preq.nr_sects;
return 0;
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 3355f1cdd4e5..3cc6d1d86f1e 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -379,7 +379,7 @@ static struct attribute *xen_vbdstat_attrs[] = {
NULL
};
-static struct attribute_group xen_vbdstat_group = {
+static const struct attribute_group xen_vbdstat_group = {
.name = "statistics",
.attrs = xen_vbdstat_attrs,
};
@@ -480,7 +480,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags))
vbd->flush_support = true;
- if (q && blk_queue_secdiscard(q))
+ if (q && blk_queue_secure_erase(q))
vbd->discard_secure = true;
pr_debug("Successful creation of handle=%04x (dom=%u)\n",
@@ -715,8 +715,11 @@ static void backend_changed(struct xenbus_watch *watch,
/* Front end dir is a number, which is used as the handle. */
err = kstrtoul(strrchr(dev->otherend, '/') + 1, 0, &handle);
- if (err)
+ if (err) {
+ kfree(be->mode);
+ be->mode = NULL;
return;
+ }
be->major = major;
be->minor = minor;
@@ -1022,9 +1025,9 @@ static int connect_ring(struct backend_info *be)
pr_debug("%s %s\n", __func__, dev->otherend);
be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
- err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
- "%63s", protocol, NULL);
- if (err)
+ err = xenbus_scanf(XBT_NIL, dev->otherend, "protocol",
+ "%63s", protocol);
+ if (err <= 0)
strcpy(protocol, "unspecified, assuming default");
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
@@ -1036,10 +1039,9 @@ static int connect_ring(struct backend_info *be)
xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
return -ENOSYS;
}
- err = xenbus_gather(XBT_NIL, dev->otherend,
- "feature-persistent", "%u",
- &pers_grants, NULL);
- if (err)
+ err = xenbus_scanf(XBT_NIL, dev->otherend,
+ "feature-persistent", "%u", &pers_grants);
+ if (err <= 0)
pers_grants = 0;
be->blkif->vbd.feature_gnt_persistent = pers_grants;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index fcc5b4e0aef2..be4fea6a5dd3 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -196,6 +196,7 @@ struct blkfront_info
unsigned int nr_ring_pages;
struct request_queue *rq;
unsigned int feature_flush;
+ unsigned int feature_fua;
unsigned int feature_discard:1;
unsigned int feature_secdiscard:1;
unsigned int discard_granularity;
@@ -547,7 +548,7 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf
ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
ring_req->u.discard.id = id;
ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
- if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+ if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard)
ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
else
ring_req->u.discard.flag = 0;
@@ -746,7 +747,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
* The indirect operation can only be a BLKIF_OP_READ or
* BLKIF_OP_WRITE
*/
- BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
+ BUG_ON(req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA);
ring_req->operation = BLKIF_OP_INDIRECT;
ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
BLKIF_OP_WRITE : BLKIF_OP_READ;
@@ -758,7 +759,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
ring_req->u.rw.handle = info->handle;
ring_req->operation = rq_data_dir(req) ?
BLKIF_OP_WRITE : BLKIF_OP_READ;
- if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+ if (req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA) {
/*
* Ideally we can do an unordered flush-to-disk.
* In case the backend onlysupports barriers, use that.
@@ -766,19 +767,14 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
* implement it the same way. (It's also a FLUSH+FUA,
* since it is guaranteed ordered WRT previous writes.)
*/
- switch (info->feature_flush &
- ((REQ_FLUSH|REQ_FUA))) {
- case REQ_FLUSH|REQ_FUA:
+ if (info->feature_flush && info->feature_fua)
ring_req->operation =
BLKIF_OP_WRITE_BARRIER;
- break;
- case REQ_FLUSH:
+ else if (info->feature_flush)
ring_req->operation =
BLKIF_OP_FLUSH_DISKCACHE;
- break;
- default:
+ else
ring_req->operation = 0;
- }
}
ring_req->u.rw.nr_segments = num_grant;
if (unlikely(require_extra_req)) {
@@ -847,7 +843,8 @@ static int blkif_queue_request(struct request *req, struct blkfront_ring_info *r
if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
return 1;
- if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE)))
+ if (unlikely(req_op(req) == REQ_OP_DISCARD ||
+ req_op(req) == REQ_OP_SECURE_ERASE))
return blkif_queue_discard_req(req, rinfo);
else
return blkif_queue_rw_req(req, rinfo);
@@ -867,10 +864,10 @@ static inline bool blkif_request_flush_invalid(struct request *req,
struct blkfront_info *info)
{
return ((req->cmd_type != REQ_TYPE_FS) ||
- ((req->cmd_flags & REQ_FLUSH) &&
- !(info->feature_flush & REQ_FLUSH)) ||
+ ((req_op(req) == REQ_OP_FLUSH) &&
+ !info->feature_flush) ||
((req->cmd_flags & REQ_FUA) &&
- !(info->feature_flush & REQ_FUA)));
+ !info->feature_fua));
}
static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -955,7 +952,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
rq->limits.discard_granularity = info->discard_granularity;
rq->limits.discard_alignment = info->discard_alignment;
if (info->feature_secdiscard)
- queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
+ queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, rq);
}
/* Hard sector size and max sectors impersonate the equiv. hardware. */
@@ -981,24 +978,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
return 0;
}
-static const char *flush_info(unsigned int feature_flush)
+static const char *flush_info(struct blkfront_info *info)
{
- switch (feature_flush & ((REQ_FLUSH | REQ_FUA))) {
- case REQ_FLUSH|REQ_FUA:
+ if (info->feature_flush && info->feature_fua)
return "barrier: enabled;";
- case REQ_FLUSH:
+ else if (info->feature_flush)
return "flush diskcache: enabled;";
- default:
+ else
return "barrier or flush: disabled;";
- }
}
static void xlvbd_flush(struct blkfront_info *info)
{
- blk_queue_write_cache(info->rq, info->feature_flush & REQ_FLUSH,
- info->feature_flush & REQ_FUA);
+ blk_queue_write_cache(info->rq, info->feature_flush ? true : false,
+ info->feature_fua ? true : false);
pr_info("blkfront: %s: %s %s %s %s %s\n",
- info->gd->disk_name, flush_info(info->feature_flush),
+ info->gd->disk_name, flush_info(info),
"persistent grants:", info->feature_persistent ?
"enabled;" : "disabled;", "indirect descriptors:",
info->max_indirect_segments ? "enabled;" : "disabled;");
@@ -1139,7 +1134,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
gd->first_minor = minor;
gd->fops = &xlvbd_block_fops;
gd->private_data = info;
- gd->driverfs_dev = &(info->xbdev->dev);
set_capacity(gd, capacity);
if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
@@ -1597,7 +1591,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
info->feature_discard = 0;
info->feature_secdiscard = 0;
queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
- queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
+ queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
}
blk_mq_complete_request(req, error);
break;
@@ -1617,6 +1611,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
if (unlikely(error)) {
if (error == -EOPNOTSUPP)
error = 0;
+ info->feature_fua = 0;
info->feature_flush = 0;
xlvbd_flush(info);
}
@@ -2064,7 +2059,7 @@ static int blkif_recover(struct blkfront_info *info)
bio_trim(cloned_bio, offset, size);
cloned_bio->bi_private = split_bio;
cloned_bio->bi_end_io = split_bio_end;
- submit_bio(cloned_bio->bi_rw, cloned_bio);
+ submit_bio(cloned_bio);
}
/*
* Now we have to wait for all those smaller bios to
@@ -2073,7 +2068,7 @@ static int blkif_recover(struct blkfront_info *info)
continue;
}
/* We don't need to split this bio */
- submit_bio(bio->bi_rw, bio);
+ submit_bio(bio);
}
return 0;
@@ -2108,11 +2103,16 @@ static int blkfront_resume(struct xenbus_device *dev)
/*
* Get the bios in the request so we can re-queue them.
*/
- if (shadow[j].request->cmd_flags &
- (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+ if (req_op(shadow[i].request) == REQ_OP_FLUSH ||
+ req_op(shadow[i].request) == REQ_OP_DISCARD ||
+ req_op(shadow[i].request) == REQ_OP_SECURE_ERASE ||
+ shadow[j].request->cmd_flags & REQ_FUA) {
/*
* Flush operations don't contain bios, so
* we need to requeue the whole request
+ *
+ * XXX: but this doesn't make any sense for a
+ * write with the FUA flag set..
*/
list_add(&shadow[j].request->queuelist, &info->requests);
continue;
@@ -2197,10 +2197,9 @@ static void blkfront_setup_discard(struct blkfront_info *info)
info->discard_granularity = discard_granularity;
info->discard_alignment = discard_alignment;
}
- err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
- "discard-secure", "%d", &discard_secure,
- NULL);
- if (!err)
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "discard-secure", "%u", &discard_secure);
+ if (err > 0)
info->feature_secdiscard = !!discard_secure;
}
@@ -2298,10 +2297,10 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
unsigned int indirect_segments;
info->feature_flush = 0;
+ info->feature_fua = 0;
- err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
- "feature-barrier", "%d", &barrier,
- NULL);
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "feature-barrier", "%d", &barrier);
/*
* If there's no "feature-barrier" defined, then it means
@@ -2310,38 +2309,40 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
*
* If there are barriers, then we use flush.
*/
- if (!err && barrier)
- info->feature_flush = REQ_FLUSH | REQ_FUA;
+ if (err > 0 && barrier) {
+ info->feature_flush = 1;
+ info->feature_fua = 1;
+ }
+
/*
* And if there is "feature-flush-cache" use that above
* barriers.
*/
- err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
- "feature-flush-cache", "%d", &flush,
- NULL);
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "feature-flush-cache", "%d", &flush);
- if (!err && flush)
- info->feature_flush = REQ_FLUSH;
+ if (err > 0 && flush) {
+ info->feature_flush = 1;
+ info->feature_fua = 0;
+ }
- err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
- "feature-discard", "%d", &discard,
- NULL);
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "feature-discard", "%d", &discard);
- if (!err && discard)
+ if (err > 0 && discard)
blkfront_setup_discard(info);
- err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
- "feature-persistent", "%u", &persistent,
- NULL);
- if (err)
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "feature-persistent", "%d", &persistent);
+ if (err <= 0)
info->feature_persistent = 0;
else
info->feature_persistent = persistent;
- err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
- "feature-max-indirect-segments", "%u", &indirect_segments,
- NULL);
- if (err)
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "feature-max-indirect-segments", "%u",
+ &indirect_segments);
+ if (err <= 0)
info->max_indirect_segments = 0;
else
info->max_indirect_segments = min(indirect_segments,
@@ -2441,7 +2442,7 @@ static void blkfront_connect(struct blkfront_info *info)
for (i = 0; i < info->nr_rings; i++)
kick_pending_request_queues(&info->rinfo[i]);
- add_disk(info->gd);
+ device_add_disk(&info->xbdev->dev, info->gd);
info->is_ready = 1;
}
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index 386ba3d1a6ee..b8ecba6dcd3b 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -1,8 +1,7 @@
config ZRAM
tristate "Compressed RAM block device support"
- depends on BLOCK && SYSFS && ZSMALLOC
- select LZO_COMPRESS
- select LZO_DECOMPRESS
+ depends on BLOCK && SYSFS && ZSMALLOC && CRYPTO
+ select CRYPTO_LZO
default n
help
Creates virtual block devices called /dev/zramX (X = 0, 1, ...).
@@ -14,13 +13,3 @@ config ZRAM
disks and maybe many more.
See zram.txt for more information.
-
-config ZRAM_LZ4_COMPRESS
- bool "Enable LZ4 algorithm support"
- depends on ZRAM
- select LZ4_COMPRESS
- select LZ4_DECOMPRESS
- default n
- help
- This option enables LZ4 compression algorithm support. Compression
- algorithm can be changed using `comp_algorithm' device attribute. \ No newline at end of file
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index be0763ff57a2..9e2b79e9a990 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,5 +1,3 @@
-zram-y := zcomp_lzo.o zcomp.o zram_drv.o
-
-zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o
+zram-y := zcomp.o zram_drv.o
obj-$(CONFIG_ZRAM) += zram.o
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index b51a816d766b..4b5cd3a7b2b6 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -14,108 +14,150 @@
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/cpu.h>
+#include <linux/crypto.h>
#include "zcomp.h"
-#include "zcomp_lzo.h"
-#ifdef CONFIG_ZRAM_LZ4_COMPRESS
-#include "zcomp_lz4.h"
-#endif
-static struct zcomp_backend *backends[] = {
- &zcomp_lzo,
-#ifdef CONFIG_ZRAM_LZ4_COMPRESS
- &zcomp_lz4,
+static const char * const backends[] = {
+ "lzo",
+#if IS_ENABLED(CONFIG_CRYPTO_LZ4)
+ "lz4",
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_DEFLATE)
+ "deflate",
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_LZ4HC)
+ "lz4hc",
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_842)
+ "842",
#endif
NULL
};
-static struct zcomp_backend *find_backend(const char *compress)
-{
- int i = 0;
- while (backends[i]) {
- if (sysfs_streq(compress, backends[i]->name))
- break;
- i++;
- }
- return backends[i];
-}
-
-static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm)
+static void zcomp_strm_free(struct zcomp_strm *zstrm)
{
- if (zstrm->private)
- comp->backend->destroy(zstrm->private);
+ if (!IS_ERR_OR_NULL(zstrm->tfm))
+ crypto_free_comp(zstrm->tfm);
free_pages((unsigned long)zstrm->buffer, 1);
kfree(zstrm);
}
/*
- * allocate new zcomp_strm structure with ->private initialized by
+ * allocate new zcomp_strm structure with ->tfm initialized by
* backend, return NULL on error
*/
-static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags)
+static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
{
- struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags);
+ struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL);
if (!zstrm)
return NULL;
- zstrm->private = comp->backend->create(flags);
+ zstrm->tfm = crypto_alloc_comp(comp->name, 0, 0);
/*
* allocate 2 pages. 1 for compressed data, plus 1 extra for the
* case when compressed size is larger than the original one
*/
- zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1);
- if (!zstrm->private || !zstrm->buffer) {
- zcomp_strm_free(comp, zstrm);
+ zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+ if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) {
+ zcomp_strm_free(zstrm);
zstrm = NULL;
}
return zstrm;
}
+bool zcomp_available_algorithm(const char *comp)
+{
+ int i = 0;
+
+ while (backends[i]) {
+ if (sysfs_streq(comp, backends[i]))
+ return true;
+ i++;
+ }
+
+ /*
+ * Crypto does not ignore a trailing new line symbol,
+ * so make sure you don't supply a string containing
+ * one.
+ * This also means that we permit zcomp initialisation
+ * with any compressing algorithm known to crypto api.
+ */
+ return crypto_has_comp(comp, 0, 0) == 1;
+}
+
/* show available compressors */
ssize_t zcomp_available_show(const char *comp, char *buf)
{
+ bool known_algorithm = false;
ssize_t sz = 0;
int i = 0;
- while (backends[i]) {
- if (!strcmp(comp, backends[i]->name))
+ for (; backends[i]; i++) {
+ if (!strcmp(comp, backends[i])) {
+ known_algorithm = true;
sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
- "[%s] ", backends[i]->name);
- else
+ "[%s] ", backends[i]);
+ } else {
sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
- "%s ", backends[i]->name);
- i++;
+ "%s ", backends[i]);
+ }
}
+
+ /*
+ * Out-of-tree module known to crypto api or a missing
+ * entry in `backends'.
+ */
+ if (!known_algorithm && crypto_has_comp(comp, 0, 0) == 1)
+ sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
+ "[%s] ", comp);
+
sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n");
return sz;
}
-bool zcomp_available_algorithm(const char *comp)
-{
- return find_backend(comp) != NULL;
-}
-
-struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
+struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
{
return *get_cpu_ptr(comp->stream);
}
-void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm)
+void zcomp_stream_put(struct zcomp *comp)
{
put_cpu_ptr(comp->stream);
}
-int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
- const unsigned char *src, size_t *dst_len)
+int zcomp_compress(struct zcomp_strm *zstrm,
+ const void *src, unsigned int *dst_len)
{
- return comp->backend->compress(src, zstrm->buffer, dst_len,
- zstrm->private);
+ /*
+ * Our dst memory (zstrm->buffer) is always `2 * PAGE_SIZE' sized
+ * because sometimes we can endup having a bigger compressed data
+ * due to various reasons: for example compression algorithms tend
+ * to add some padding to the compressed buffer. Speaking of padding,
+ * comp algorithm `842' pads the compressed length to multiple of 8
+ * and returns -ENOSP when the dst memory is not big enough, which
+ * is not something that ZRAM wants to see. We can handle the
+ * `compressed_size > PAGE_SIZE' case easily in ZRAM, but when we
+ * receive -ERRNO from the compressing backend we can't help it
+ * anymore. To make `842' happy we need to tell the exact size of
+ * the dst buffer, zram_drv will take care of the fact that
+ * compressed buffer is too big.
+ */
+ *dst_len = PAGE_SIZE * 2;
+
+ return crypto_comp_compress(zstrm->tfm,
+ src, PAGE_SIZE,
+ zstrm->buffer, dst_len);
}
-int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
- size_t src_len, unsigned char *dst)
+int zcomp_decompress(struct zcomp_strm *zstrm,
+ const void *src, unsigned int src_len, void *dst)
{
- return comp->backend->decompress(src, src_len, dst);
+ unsigned int dst_len = PAGE_SIZE;
+
+ return crypto_comp_decompress(zstrm->tfm,
+ src, src_len,
+ dst, &dst_len);
}
static int __zcomp_cpu_notifier(struct zcomp *comp,
@@ -127,7 +169,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
case CPU_UP_PREPARE:
if (WARN_ON(*per_cpu_ptr(comp->stream, cpu)))
break;
- zstrm = zcomp_strm_alloc(comp, GFP_KERNEL);
+ zstrm = zcomp_strm_alloc(comp);
if (IS_ERR_OR_NULL(zstrm)) {
pr_err("Can't allocate a compression stream\n");
return NOTIFY_BAD;
@@ -138,7 +180,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
case CPU_UP_CANCELED:
zstrm = *per_cpu_ptr(comp->stream, cpu);
if (!IS_ERR_OR_NULL(zstrm))
- zcomp_strm_free(comp, zstrm);
+ zcomp_strm_free(zstrm);
*per_cpu_ptr(comp->stream, cpu) = NULL;
break;
default:
@@ -209,18 +251,16 @@ void zcomp_destroy(struct zcomp *comp)
struct zcomp *zcomp_create(const char *compress)
{
struct zcomp *comp;
- struct zcomp_backend *backend;
int error;
- backend = find_backend(compress);
- if (!backend)
+ if (!zcomp_available_algorithm(compress))
return ERR_PTR(-EINVAL);
comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
if (!comp)
return ERR_PTR(-ENOMEM);
- comp->backend = backend;
+ comp->name = compress;
error = zcomp_init(comp);
if (error) {
kfree(comp);
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index ffd88cb747fe..478cac2ed465 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -13,33 +13,15 @@
struct zcomp_strm {
/* compression/decompression buffer */
void *buffer;
- /*
- * The private data of the compression stream, only compression
- * stream backend can touch this (e.g. compression algorithm
- * working memory)
- */
- void *private;
-};
-
-/* static compression backend */
-struct zcomp_backend {
- int (*compress)(const unsigned char *src, unsigned char *dst,
- size_t *dst_len, void *private);
-
- int (*decompress)(const unsigned char *src, size_t src_len,
- unsigned char *dst);
-
- void *(*create)(gfp_t flags);
- void (*destroy)(void *private);
-
- const char *name;
+ struct crypto_comp *tfm;
};
/* dynamic per-device compression frontend */
struct zcomp {
struct zcomp_strm * __percpu *stream;
- struct zcomp_backend *backend;
struct notifier_block notifier;
+
+ const char *name;
};
ssize_t zcomp_available_show(const char *comp, char *buf);
@@ -48,14 +30,14 @@ bool zcomp_available_algorithm(const char *comp);
struct zcomp *zcomp_create(const char *comp);
void zcomp_destroy(struct zcomp *comp);
-struct zcomp_strm *zcomp_strm_find(struct zcomp *comp);
-void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm);
+struct zcomp_strm *zcomp_stream_get(struct zcomp *comp);
+void zcomp_stream_put(struct zcomp *comp);
-int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
- const unsigned char *src, size_t *dst_len);
+int zcomp_compress(struct zcomp_strm *zstrm,
+ const void *src, unsigned int *dst_len);
-int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
- size_t src_len, unsigned char *dst);
+int zcomp_decompress(struct zcomp_strm *zstrm,
+ const void *src, unsigned int src_len, void *dst);
bool zcomp_set_max_streams(struct zcomp *comp, int num_strm);
#endif /* _ZCOMP_H_ */
diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
deleted file mode 100644
index 0110086accba..000000000000
--- a/drivers/block/zram/zcomp_lz4.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2014 Sergey Senozhatsky.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/lz4.h>
-#include <linux/vmalloc.h>
-#include <linux/mm.h>
-
-#include "zcomp_lz4.h"
-
-static void *zcomp_lz4_create(gfp_t flags)
-{
- void *ret;
-
- ret = kmalloc(LZ4_MEM_COMPRESS, flags);
- if (!ret)
- ret = __vmalloc(LZ4_MEM_COMPRESS,
- flags | __GFP_HIGHMEM,
- PAGE_KERNEL);
- return ret;
-}
-
-static void zcomp_lz4_destroy(void *private)
-{
- kvfree(private);
-}
-
-static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst,
- size_t *dst_len, void *private)
-{
- /* return : Success if return 0 */
- return lz4_compress(src, PAGE_SIZE, dst, dst_len, private);
-}
-
-static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len,
- unsigned char *dst)
-{
- size_t dst_len = PAGE_SIZE;
- /* return : Success if return 0 */
- return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len);
-}
-
-struct zcomp_backend zcomp_lz4 = {
- .compress = zcomp_lz4_compress,
- .decompress = zcomp_lz4_decompress,
- .create = zcomp_lz4_create,
- .destroy = zcomp_lz4_destroy,
- .name = "lz4",
-};
diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h
deleted file mode 100644
index 60613fb29dd8..000000000000
--- a/drivers/block/zram/zcomp_lz4.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) 2014 Sergey Senozhatsky.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _ZCOMP_LZ4_H_
-#define _ZCOMP_LZ4_H_
-
-#include "zcomp.h"
-
-extern struct zcomp_backend zcomp_lz4;
-
-#endif /* _ZCOMP_LZ4_H_ */
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
deleted file mode 100644
index ed7a1f0549ec..000000000000
--- a/drivers/block/zram/zcomp_lzo.c
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2014 Sergey Senozhatsky.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/lzo.h>
-#include <linux/vmalloc.h>
-#include <linux/mm.h>
-
-#include "zcomp_lzo.h"
-
-static void *lzo_create(gfp_t flags)
-{
- void *ret;
-
- ret = kmalloc(LZO1X_MEM_COMPRESS, flags);
- if (!ret)
- ret = __vmalloc(LZO1X_MEM_COMPRESS,
- flags | __GFP_HIGHMEM,
- PAGE_KERNEL);
- return ret;
-}
-
-static void lzo_destroy(void *private)
-{
- kvfree(private);
-}
-
-static int lzo_compress(const unsigned char *src, unsigned char *dst,
- size_t *dst_len, void *private)
-{
- int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private);
- return ret == LZO_E_OK ? 0 : ret;
-}
-
-static int lzo_decompress(const unsigned char *src, size_t src_len,
- unsigned char *dst)
-{
- size_t dst_len = PAGE_SIZE;
- int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len);
- return ret == LZO_E_OK ? 0 : ret;
-}
-
-struct zcomp_backend zcomp_lzo = {
- .compress = lzo_compress,
- .decompress = lzo_decompress,
- .create = lzo_create,
- .destroy = lzo_destroy,
- .name = "lzo",
-};
diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h
deleted file mode 100644
index 128c5807fa14..000000000000
--- a/drivers/block/zram/zcomp_lzo.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) 2014 Sergey Senozhatsky.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _ZCOMP_LZO_H_
-#define _ZCOMP_LZO_H_
-
-#include "zcomp.h"
-
-extern struct zcomp_backend zcomp_lzo;
-
-#endif /* _ZCOMP_LZO_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 8fcad8b761f1..7454cf188c8e 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -342,9 +342,16 @@ static ssize_t comp_algorithm_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
struct zram *zram = dev_to_zram(dev);
+ char compressor[CRYPTO_MAX_ALG_NAME];
size_t sz;
- if (!zcomp_available_algorithm(buf))
+ strlcpy(compressor, buf, sizeof(compressor));
+ /* ignore trailing newline */
+ sz = strlen(compressor);
+ if (sz > 0 && compressor[sz - 1] == '\n')
+ compressor[sz - 1] = 0x00;
+
+ if (!zcomp_available_algorithm(compressor))
return -EINVAL;
down_write(&zram->init_lock);
@@ -353,13 +360,8 @@ static ssize_t comp_algorithm_store(struct device *dev,
pr_info("Can't change algorithm for initialized device\n");
return -EBUSY;
}
- strlcpy(zram->compressor, buf, sizeof(zram->compressor));
-
- /* ignore trailing newline */
- sz = strlen(zram->compressor);
- if (sz > 0 && zram->compressor[sz - 1] == '\n')
- zram->compressor[sz - 1] = 0x00;
+ strlcpy(zram->compressor, compressor, sizeof(compressor));
up_write(&zram->init_lock);
return len;
}
@@ -563,7 +565,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
unsigned char *cmem;
struct zram_meta *meta = zram->meta;
unsigned long handle;
- size_t size;
+ unsigned int size;
bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
handle = meta->table[index].handle;
@@ -576,10 +578,14 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
}
cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
- if (size == PAGE_SIZE)
+ if (size == PAGE_SIZE) {
copy_page(mem, cmem);
- else
- ret = zcomp_decompress(zram->comp, cmem, size, mem);
+ } else {
+ struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
+
+ ret = zcomp_decompress(zstrm, cmem, size, mem);
+ zcomp_stream_put(zram->comp);
+ }
zs_unmap_object(meta->mem_pool, handle);
bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
@@ -646,7 +652,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
int offset)
{
int ret = 0;
- size_t clen;
+ unsigned int clen;
unsigned long handle = 0;
struct page *page;
unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
@@ -695,8 +701,8 @@ compress_again:
goto out;
}
- zstrm = zcomp_strm_find(zram->comp);
- ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
+ zstrm = zcomp_stream_get(zram->comp);
+ ret = zcomp_compress(zstrm, uncmem, &clen);
if (!is_partial_io(bvec)) {
kunmap_atomic(user_mem);
user_mem = NULL;
@@ -732,19 +738,21 @@ compress_again:
handle = zs_malloc(meta->mem_pool, clen,
__GFP_KSWAPD_RECLAIM |
__GFP_NOWARN |
- __GFP_HIGHMEM);
+ __GFP_HIGHMEM |
+ __GFP_MOVABLE);
if (!handle) {
- zcomp_strm_release(zram->comp, zstrm);
+ zcomp_stream_put(zram->comp);
zstrm = NULL;
atomic64_inc(&zram->stats.writestall);
handle = zs_malloc(meta->mem_pool, clen,
- GFP_NOIO | __GFP_HIGHMEM);
+ GFP_NOIO | __GFP_HIGHMEM |
+ __GFP_MOVABLE);
if (handle)
goto compress_again;
- pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
+ pr_err("Error allocating memory for compressed page: %u, size=%u\n",
index, clen);
ret = -ENOMEM;
goto out;
@@ -769,7 +777,7 @@ compress_again:
memcpy(cmem, src, clen);
}
- zcomp_strm_release(zram->comp, zstrm);
+ zcomp_stream_put(zram->comp);
zstrm = NULL;
zs_unmap_object(meta->mem_pool, handle);
@@ -789,7 +797,7 @@ compress_again:
atomic64_inc(&zram->stats.pages_stored);
out:
if (zstrm)
- zcomp_strm_release(zram->comp, zstrm);
+ zcomp_stream_put(zram->comp);
if (is_partial_io(bvec))
kfree(uncmem);
return ret;
@@ -874,7 +882,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
offset = (bio->bi_iter.bi_sector &
(SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
- if (unlikely(bio->bi_rw & REQ_DISCARD)) {
+ if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
zram_bio_discard(zram, index, offset, bio);
bio_endio(bio);
return;
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 3f5bf66a27e4..74fcf10da374 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -15,8 +15,9 @@
#ifndef _ZRAM_DRV_H_
#define _ZRAM_DRV_H_
-#include <linux/spinlock.h>
+#include <linux/rwsem.h>
#include <linux/zsmalloc.h>
+#include <linux/crypto.h>
#include "zcomp.h"
@@ -113,7 +114,7 @@ struct zram {
* we can store in a disk.
*/
u64 disksize; /* bytes */
- char compressor[10];
+ char compressor[CRYPTO_MAX_ALG_NAME];
/*
* zram is claimed so open request will be failed
*/