summaryrefslogtreecommitdiffstats
path: root/drivers/block/xen-blkback
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-07-22 19:02:52 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2013-07-22 19:02:52 -0700
commitd4c90b1b9fe907da0d310008e5a769b591a14399 (patch)
treed37589ab70ada2778d315a0ad24d6e68c8615af6 /drivers/block/xen-blkback
parent3b2f64d00c46e1e4e9bd0bb9bb12619adac27a4b (diff)
parent0878ae2db83a10894724cdeaba7ef9f1ac1c9ac8 (diff)
downloadlinux-d4c90b1b9fe907da0d310008e5a769b591a14399.tar.gz
linux-d4c90b1b9fe907da0d310008e5a769b591a14399.tar.bz2
linux-d4c90b1b9fe907da0d310008e5a769b591a14399.zip
Merge branch 'for-3.11/drivers' of git://git.kernel.dk/linux-block
Pull block IO driver bits from Jens Axboe: "As I mentioned in the core block pull request, due to real life circumstances the driver pull request would be late. Now it looks like -rc2 late... On the plus side, apart form the rsxx update, these are all things that I could argue could go in later in the cycle as they are fixes and not features. So even though things are late, it's not ALL bad. The pull request contains: - Updates to bcache, all bug fixes, from Kent. - A pile of drbd bug fixes (no big features this time!). - xen blk front/back fixes. - rsxx driver updates, some of them deferred form 3.10. So should be well cooked by now" * 'for-3.11/drivers' of git://git.kernel.dk/linux-block: (63 commits) bcache: Allocation kthread fixes bcache: Fix GC_SECTORS_USED() calculation bcache: Journal replay fix bcache: Shutdown fix bcache: Fix a sysfs splat on shutdown bcache: Advertise that flushes are supported bcache: check for allocation failures bcache: Fix a dumb race bcache: Use standard utility code bcache: Update email address bcache: Delete fuzz tester bcache: Document shrinker reserve better bcache: FUA fixes drbd: Allow online change of al-stripes and al-stripe-size drbd: Constants should be UPPERCASE drbd: Ignore the exit code of a fence-peer handler if it returns too late drbd: Fix rcu_read_lock balance on error path drbd: fix error return code in drbd_init() drbd: Do not sleep inside rcu bcache: Refresh usage docs ...
Diffstat (limited to 'drivers/block/xen-blkback')
-rw-r--r--drivers/block/xen-blkback/blkback.c872
-rw-r--r--drivers/block/xen-blkback/common.h147
-rw-r--r--drivers/block/xen-blkback/xenbus.c85
3 files changed, 782 insertions, 322 deletions
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index dd5b2fed97e9..bf4b9d282c04 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -50,110 +50,118 @@
#include "common.h"
/*
- * These are rather arbitrary. They are fairly large because adjacent requests
- * pulled from a communication ring are quite likely to end up being part of
- * the same scatter/gather request at the disc.
+ * Maximum number of unused free pages to keep in the internal buffer.
+ * Setting this to a value too low will reduce memory used in each backend,
+ * but can have a performance penalty.
*
- * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
- *
- * This will increase the chances of being able to write whole tracks.
- * 64 should be enough to keep us competitive with Linux.
+ * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
+ * be set to a lower value that might degrade performance on some intensive
+ * IO workloads.
*/
-static int xen_blkif_reqs = 64;
-module_param_named(reqs, xen_blkif_reqs, int, 0);
-MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
-/* Run-time switchable: /sys/module/blkback/parameters/ */
-static unsigned int log_stats;
-module_param(log_stats, int, 0644);
+static int xen_blkif_max_buffer_pages = 1024;
+module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
+MODULE_PARM_DESC(max_buffer_pages,
+"Maximum number of free pages to keep in each block backend buffer");
/*
- * Each outstanding request that we've passed to the lower device layers has a
- * 'pending_req' allocated to it. Each buffer_head that completes decrements
- * the pendcnt towards zero. When it hits zero, the specified domain has a
- * response queued for it, with the saved 'id' passed back.
+ * Maximum number of grants to map persistently in blkback. For maximum
+ * performance this should be the total numbers of grants that can be used
+ * to fill the ring, but since this might become too high, specially with
+ * the use of indirect descriptors, we set it to a value that provides good
+ * performance without using too much memory.
+ *
+ * When the list of persistent grants is full we clean it up using a LRU
+ * algorithm.
*/
-struct pending_req {
- struct xen_blkif *blkif;
- u64 id;
- int nr_pages;
- atomic_t pendcnt;
- unsigned short operation;
- int status;
- struct list_head free_list;
- DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-};
-#define BLKBACK_INVALID_HANDLE (~0)
+static int xen_blkif_max_pgrants = 1056;
+module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
+MODULE_PARM_DESC(max_persistent_grants,
+ "Maximum number of grants to map persistently");
-struct xen_blkbk {
- struct pending_req *pending_reqs;
- /* List of all 'pending_req' available */
- struct list_head pending_free;
- /* And its spinlock. */
- spinlock_t pending_free_lock;
- wait_queue_head_t pending_free_wq;
- /* The list of all pages that are available. */
- struct page **pending_pages;
- /* And the grant handles that are available. */
- grant_handle_t *pending_grant_handles;
-};
-
-static struct xen_blkbk *blkbk;
+/*
+ * The LRU mechanism to clean the lists of persistent grants needs to
+ * be executed periodically. The time interval between consecutive executions
+ * of the purge mechanism is set in ms.
+ */
+#define LRU_INTERVAL 100
/*
- * Maximum number of grant pages that can be mapped in blkback.
- * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of
- * pages that blkback will persistently map.
- * Currently, this is:
- * RING_SIZE = 32 (for all known ring types)
- * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
- * sizeof(struct persistent_gnt) = 48
- * So the maximum memory used to store the grants is:
- * 32 * 11 * 48 = 16896 bytes
+ * When the persistent grants list is full we will remove unused grants
+ * from the list. The percent number of grants to be removed at each LRU
+ * execution.
*/
-static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol)
+#define LRU_PERCENT_CLEAN 5
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static unsigned int log_stats;
+module_param(log_stats, int, 0644);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+/* Number of free pages to remove on each call to free_xenballooned_pages */
+#define NUM_BATCH_FREE_PAGES 10
+
+static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
{
- switch (protocol) {
- case BLKIF_PROTOCOL_NATIVE:
- return __CONST_RING_SIZE(blkif, PAGE_SIZE) *
- BLKIF_MAX_SEGMENTS_PER_REQUEST;
- case BLKIF_PROTOCOL_X86_32:
- return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) *
- BLKIF_MAX_SEGMENTS_PER_REQUEST;
- case BLKIF_PROTOCOL_X86_64:
- return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
- BLKIF_MAX_SEGMENTS_PER_REQUEST;
- default:
- BUG();
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ if (list_empty(&blkif->free_pages)) {
+ BUG_ON(blkif->free_pages_num != 0);
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+ return alloc_xenballooned_pages(1, page, false);
}
+ BUG_ON(blkif->free_pages_num == 0);
+ page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
+ list_del(&page[0]->lru);
+ blkif->free_pages_num--;
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+
return 0;
}
-
-/*
- * Little helpful macro to figure out the index and virtual address of the
- * pending_pages[..]. For each 'pending_req' we have have up to
- * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
- * 10 and would index in the pending_pages[..].
- */
-static inline int vaddr_pagenr(struct pending_req *req, int seg)
+static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
+ int num)
{
- return (req - blkbk->pending_reqs) *
- BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
-}
+ unsigned long flags;
+ int i;
-#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ for (i = 0; i < num; i++)
+ list_add(&page[i]->lru, &blkif->free_pages);
+ blkif->free_pages_num += num;
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+}
-static inline unsigned long vaddr(struct pending_req *req, int seg)
+static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
{
- unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
- return (unsigned long)pfn_to_kaddr(pfn);
-}
+ /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
+ struct page *page[NUM_BATCH_FREE_PAGES];
+ unsigned int num_pages = 0;
+ unsigned long flags;
-#define pending_handle(_req, _seg) \
- (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ while (blkif->free_pages_num > num) {
+ BUG_ON(list_empty(&blkif->free_pages));
+ page[num_pages] = list_first_entry(&blkif->free_pages,
+ struct page, lru);
+ list_del(&page[num_pages]->lru);
+ blkif->free_pages_num--;
+ if (++num_pages == NUM_BATCH_FREE_PAGES) {
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+ free_xenballooned_pages(num_pages, page);
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ num_pages = 0;
+ }
+ }
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+ if (num_pages != 0)
+ free_xenballooned_pages(num_pages, page);
+}
+#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
static int do_block_io_op(struct xen_blkif *blkif);
static int dispatch_rw_block_io(struct xen_blkif *blkif,
@@ -170,13 +178,29 @@ static void make_response(struct xen_blkif *blkif, u64 id,
(n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
-static void add_persistent_gnt(struct rb_root *root,
+/*
+ * We don't need locking around the persistent grant helpers
+ * because blkback uses a single-thread for each backed, so we
+ * can be sure that this functions will never be called recursively.
+ *
+ * The only exception to that is put_persistent_grant, that can be called
+ * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
+ * bit operations to modify the flags of a persistent grant and to count
+ * the number of used grants.
+ */
+static int add_persistent_gnt(struct xen_blkif *blkif,
struct persistent_gnt *persistent_gnt)
{
- struct rb_node **new = &(root->rb_node), *parent = NULL;
+ struct rb_node **new = NULL, *parent = NULL;
struct persistent_gnt *this;
+ if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
+ if (!blkif->vbd.overflow_max_grants)
+ blkif->vbd.overflow_max_grants = 1;
+ return -EBUSY;
+ }
/* Figure out where to put new node */
+ new = &blkif->persistent_gnts.rb_node;
while (*new) {
this = container_of(*new, struct persistent_gnt, node);
@@ -186,22 +210,28 @@ static void add_persistent_gnt(struct rb_root *root,
else if (persistent_gnt->gnt > this->gnt)
new = &((*new)->rb_right);
else {
- pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n");
- BUG();
+ pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n");
+ return -EINVAL;
}
}
+ bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
+ set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
/* Add new node and rebalance tree. */
rb_link_node(&(persistent_gnt->node), parent, new);
- rb_insert_color(&(persistent_gnt->node), root);
+ rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
+ blkif->persistent_gnt_c++;
+ atomic_inc(&blkif->persistent_gnt_in_use);
+ return 0;
}
-static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
+static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
grant_ref_t gref)
{
struct persistent_gnt *data;
- struct rb_node *node = root->rb_node;
+ struct rb_node *node = NULL;
+ node = blkif->persistent_gnts.rb_node;
while (node) {
data = container_of(node, struct persistent_gnt, node);
@@ -209,13 +239,31 @@ static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
node = node->rb_left;
else if (gref > data->gnt)
node = node->rb_right;
- else
+ else {
+ if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
+ pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n");
+ return NULL;
+ }
+ set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
+ atomic_inc(&blkif->persistent_gnt_in_use);
return data;
+ }
}
return NULL;
}
-static void free_persistent_gnts(struct rb_root *root, unsigned int num)
+static void put_persistent_gnt(struct xen_blkif *blkif,
+ struct persistent_gnt *persistent_gnt)
+{
+ if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
+ pr_alert_ratelimited(DRV_PFX " freeing a grant already unused");
+ set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
+ clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
+ atomic_dec(&blkif->persistent_gnt_in_use);
+}
+
+static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
+ unsigned int num)
{
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -240,7 +288,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
ret = gnttab_unmap_refs(unmap, NULL, pages,
segs_to_unmap);
BUG_ON(ret);
- free_xenballooned_pages(segs_to_unmap, pages);
+ put_free_pages(blkif, pages, segs_to_unmap);
segs_to_unmap = 0;
}
@@ -251,21 +299,148 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
BUG_ON(num != 0);
}
+static void unmap_purged_grants(struct work_struct *work)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct persistent_gnt *persistent_gnt;
+ int ret, segs_to_unmap = 0;
+ struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
+
+ while(!list_empty(&blkif->persistent_purge_list)) {
+ persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
+ struct persistent_gnt,
+ remove_node);
+ list_del(&persistent_gnt->remove_node);
+
+ gnttab_set_unmap_op(&unmap[segs_to_unmap],
+ vaddr(persistent_gnt->page),
+ GNTMAP_host_map,
+ persistent_gnt->handle);
+
+ pages[segs_to_unmap] = persistent_gnt->page;
+
+ if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+ ret = gnttab_unmap_refs(unmap, NULL, pages,
+ segs_to_unmap);
+ BUG_ON(ret);
+ put_free_pages(blkif, pages, segs_to_unmap);
+ segs_to_unmap = 0;
+ }
+ kfree(persistent_gnt);
+ }
+ if (segs_to_unmap > 0) {
+ ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap);
+ BUG_ON(ret);
+ put_free_pages(blkif, pages, segs_to_unmap);
+ }
+}
+
+static void purge_persistent_gnt(struct xen_blkif *blkif)
+{
+ struct persistent_gnt *persistent_gnt;
+ struct rb_node *n;
+ unsigned int num_clean, total;
+ bool scan_used = false, clean_used = false;
+ struct rb_root *root;
+
+ if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
+ (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
+ !blkif->vbd.overflow_max_grants)) {
+ return;
+ }
+
+ if (work_pending(&blkif->persistent_purge_work)) {
+ pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n");
+ return;
+ }
+
+ num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
+ num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
+ num_clean = min(blkif->persistent_gnt_c, num_clean);
+ if ((num_clean == 0) ||
+ (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
+ return;
+
+ /*
+ * At this point, we can assure that there will be no calls
+ * to get_persistent_grant (because we are executing this code from
+ * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
+ * which means that the number of currently used grants will go down,
+ * but never up, so we will always be able to remove the requested
+ * number of grants.
+ */
+
+ total = num_clean;
+
+ pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean);
+
+ INIT_LIST_HEAD(&blkif->persistent_purge_list);
+ root = &blkif->persistent_gnts;
+purge_list:
+ foreach_grant_safe(persistent_gnt, n, root, node) {
+ BUG_ON(persistent_gnt->handle ==
+ BLKBACK_INVALID_HANDLE);
+
+ if (clean_used) {
+ clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
+ continue;
+ }
+
+ if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
+ continue;
+ if (!scan_used &&
+ (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
+ continue;
+
+ rb_erase(&persistent_gnt->node, root);
+ list_add(&persistent_gnt->remove_node,
+ &blkif->persistent_purge_list);
+ if (--num_clean == 0)
+ goto finished;
+ }
+ /*
+ * If we get here it means we also need to start cleaning
+ * grants that were used since last purge in order to cope
+ * with the requested num
+ */
+ if (!scan_used && !clean_used) {
+ pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean);
+ scan_used = true;
+ goto purge_list;
+ }
+finished:
+ if (!clean_used) {
+ pr_debug(DRV_PFX "Finished scanning for grants to clean, removing used flag\n");
+ clean_used = true;
+ goto purge_list;
+ }
+
+ blkif->persistent_gnt_c -= (total - num_clean);
+ blkif->vbd.overflow_max_grants = 0;
+
+ /* We can defer this work */
+ INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants);
+ schedule_work(&blkif->persistent_purge_work);
+ pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total);
+ return;
+}
+
/*
* Retrieve from the 'pending_reqs' a free pending_req structure to be used.
*/
-static struct pending_req *alloc_req(void)
+static struct pending_req *alloc_req(struct xen_blkif *blkif)
{
struct pending_req *req = NULL;
unsigned long flags;
- spin_lock_irqsave(&blkbk->pending_free_lock, flags);
- if (!list_empty(&blkbk->pending_free)) {
- req = list_entry(blkbk->pending_free.next, struct pending_req,
+ spin_lock_irqsave(&blkif->pending_free_lock, flags);
+ if (!list_empty(&blkif->pending_free)) {
+ req = list_entry(blkif->pending_free.next, struct pending_req,
free_list);
list_del(&req->free_list);
}
- spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
+ spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
return req;
}
@@ -273,17 +448,17 @@ static struct pending_req *alloc_req(void)
* Return the 'pending_req' structure back to the freepool. We also
* wake up the thread if it was waiting for a free page.
*/
-static void free_req(struct pending_req *req)
+static void free_req(struct xen_blkif *blkif, struct pending_req *req)
{
unsigned long flags;
int was_empty;
- spin_lock_irqsave(&blkbk->pending_free_lock, flags);
- was_empty = list_empty(&blkbk->pending_free);
- list_add(&req->free_list, &blkbk->pending_free);
- spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
+ spin_lock_irqsave(&blkif->pending_free_lock, flags);
+ was_empty = list_empty(&blkif->pending_free);
+ list_add(&req->free_list, &blkif->pending_free);
+ spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
if (was_empty)
- wake_up(&blkbk->pending_free_wq);
+ wake_up(&blkif->pending_free_wq);
}
/*
@@ -382,10 +557,12 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
static void print_stats(struct xen_blkif *blkif)
{
pr_info("xen-blkback (%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
- " | ds %4llu\n",
+ " | ds %4llu | pg: %4u/%4d\n",
current->comm, blkif->st_oo_req,
blkif->st_rd_req, blkif->st_wr_req,
- blkif->st_f_req, blkif->st_ds_req);
+ blkif->st_f_req, blkif->st_ds_req,
+ blkif->persistent_gnt_c,
+ xen_blkif_max_pgrants);
blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
blkif->st_rd_req = 0;
blkif->st_wr_req = 0;
@@ -397,6 +574,8 @@ int xen_blkif_schedule(void *arg)
{
struct xen_blkif *blkif = arg;
struct xen_vbd *vbd = &blkif->vbd;
+ unsigned long timeout;
+ int ret;
xen_blkif_get(blkif);
@@ -406,27 +585,52 @@ int xen_blkif_schedule(void *arg)
if (unlikely(vbd->size != vbd_sz(vbd)))
xen_vbd_resize(blkif);
- wait_event_interruptible(
+ timeout = msecs_to_jiffies(LRU_INTERVAL);
+
+ timeout = wait_event_interruptible_timeout(
blkif->wq,
- blkif->waiting_reqs || kthread_should_stop());
- wait_event_interruptible(
- blkbk->pending_free_wq,
- !list_empty(&blkbk->pending_free) ||
- kthread_should_stop());
+ blkif->waiting_reqs || kthread_should_stop(),
+ timeout);
+ if (timeout == 0)
+ goto purge_gnt_list;
+ timeout = wait_event_interruptible_timeout(
+ blkif->pending_free_wq,
+ !list_empty(&blkif->pending_free) ||
+ kthread_should_stop(),
+ timeout);
+ if (timeout == 0)
+ goto purge_gnt_list;
blkif->waiting_reqs = 0;
smp_mb(); /* clear flag *before* checking for work */
- if (do_block_io_op(blkif))
+ ret = do_block_io_op(blkif);
+ if (ret > 0)
blkif->waiting_reqs = 1;
+ if (ret == -EACCES)
+ wait_event_interruptible(blkif->shutdown_wq,
+ kthread_should_stop());
+
+purge_gnt_list:
+ if (blkif->vbd.feature_gnt_persistent &&
+ time_after(jiffies, blkif->next_lru)) {
+ purge_persistent_gnt(blkif);
+ blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
+ }
+
+ /* Shrink if we have more than xen_blkif_max_buffer_pages */
+ shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
if (log_stats && time_after(jiffies, blkif->st_print))
print_stats(blkif);
}
+ /* Since we are shutting down remove all pages from the buffer */
+ shrink_free_pagepool(blkif, 0 /* All */);
+
/* Free all persistent grant pages */
if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
- free_persistent_gnts(&blkif->persistent_gnts,
+ free_persistent_gnts(blkif, &blkif->persistent_gnts,
blkif->persistent_gnt_c);
BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
@@ -441,148 +645,98 @@ int xen_blkif_schedule(void *arg)
return 0;
}
-struct seg_buf {
- unsigned int offset;
- unsigned int nsec;
-};
/*
* Unmap the grant references, and also remove the M2P over-rides
* used in the 'pending_req'.
*/
-static void xen_blkbk_unmap(struct pending_req *req)
+static void xen_blkbk_unmap(struct xen_blkif *blkif,
+ struct grant_page *pages[],
+ int num)
{
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
unsigned int i, invcount = 0;
- grant_handle_t handle;
int ret;
- for (i = 0; i < req->nr_pages; i++) {
- if (!test_bit(i, req->unmap_seg))
+ for (i = 0; i < num; i++) {
+ if (pages[i]->persistent_gnt != NULL) {
+ put_persistent_gnt(blkif, pages[i]->persistent_gnt);
continue;
- handle = pending_handle(req, i);
- if (handle == BLKBACK_INVALID_HANDLE)
+ }
+ if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
continue;
- gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
- GNTMAP_host_map, handle);
- pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
- pages[invcount] = virt_to_page(vaddr(req, i));
- invcount++;
+ unmap_pages[invcount] = pages[i]->page;
+ gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page),
+ GNTMAP_host_map, pages[i]->handle);
+ pages[i]->handle = BLKBACK_INVALID_HANDLE;
+ if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+ ret = gnttab_unmap_refs(unmap, NULL, unmap_pages,
+ invcount);
+ BUG_ON(ret);
+ put_free_pages(blkif, unmap_pages, invcount);
+ invcount = 0;
+ }
+ }
+ if (invcount) {
+ ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
+ BUG_ON(ret);
+ put_free_pages(blkif, unmap_pages, invcount);
}
-
- ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
- BUG_ON(ret);
}
-static int xen_blkbk_map(struct blkif_request *req,
- struct pending_req *pending_req,
- struct seg_buf seg[],
- struct page *pages[])
+static int xen_blkbk_map(struct xen_blkif *blkif,
+ struct grant_page *pages[],
+ int num, bool ro)
{
struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct persistent_gnt *persistent_gnt = NULL;
- struct xen_blkif *blkif = pending_req->blkif;
phys_addr_t addr = 0;
- int i, j;
- bool new_map;
- int nseg = req->u.rw.nr_segments;
+ int i, seg_idx, new_map_idx;
int segs_to_map = 0;
int ret = 0;
+ int last_map = 0, map_until = 0;
int use_persistent_gnts;
use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
- BUG_ON(blkif->persistent_gnt_c >
- max_mapped_grant_pages(pending_req->blkif->blk_protocol));
-
/*
* Fill out preq.nr_sects with proper amount of sectors, and setup
* assign map[..] with the PFN of the page in our domain with the
* corresponding grant reference for each page.
*/
- for (i = 0; i < nseg; i++) {
+again:
+ for (i = map_until; i < num; i++) {
uint32_t flags;
if (use_persistent_gnts)
persistent_gnt = get_persistent_gnt(
- &blkif->persistent_gnts,
- req->u.rw.seg[i].gref);
+ blkif,
+ pages[i]->gref);
if (persistent_gnt) {
/*
* We are using persistent grants and
* the grant is already mapped
*/
- new_map = false;
- } else if (use_persistent_gnts &&
- blkif->persistent_gnt_c <
- max_mapped_grant_pages(blkif->blk_protocol)) {
- /*
- * We are using persistent grants, the grant is
- * not mapped but we have room for it
- */
- new_map = true;
- persistent_gnt = kmalloc(
- sizeof(struct persistent_gnt),
- GFP_KERNEL);
- if (!persistent_gnt)
- return -ENOMEM;
- if (alloc_xenballooned_pages(1, &persistent_gnt->page,
- false)) {
- kfree(persistent_gnt);
- return -ENOMEM;
- }
- persistent_gnt->gnt = req->u.rw.seg[i].gref;
- persistent_gnt->handle = BLKBACK_INVALID_HANDLE;
-
- pages_to_gnt[segs_to_map] =
- persistent_gnt->page;
- addr = (unsigned long) pfn_to_kaddr(
- page_to_pfn(persistent_gnt->page));
-
- add_persistent_gnt(&blkif->persistent_gnts,
- persistent_gnt);
- blkif->persistent_gnt_c++;
- pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
- persistent_gnt->gnt, blkif->persistent_gnt_c,
- max_mapped_grant_pages(blkif->blk_protocol));
+ pages[i]->page = persistent_gnt->page;
+ pages[i]->persistent_gnt = persistent_gnt;
} else {
- /*
- * We are either using persistent grants and
- * hit the maximum limit of grants mapped,
- * or we are not using persistent grants.
- */
- if (use_persistent_gnts &&
- !blkif->vbd.overflow_max_grants) {
- blkif->vbd.overflow_max_grants = 1;
- pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
- blkif->domid, blkif->vbd.handle);
- }
- new_map = true;
- pages[i] = blkbk->pending_page(pending_req, i);
- addr = vaddr(pending_req, i);
- pages_to_gnt[segs_to_map] =
- blkbk->pending_page(pending_req, i);
- }
-
- if (persistent_gnt) {
- pages[i] = persistent_gnt->page;
- persistent_gnts[i] = persistent_gnt;
- } else {
- persistent_gnts[i] = NULL;
- }
-
- if (new_map) {
+ if (get_free_page(blkif, &pages[i]->page))
+ goto out_of_memory;
+ addr = vaddr(pages[i]->page);
+ pages_to_gnt[segs_to_map] = pages[i]->page;
+ pages[i]->persistent_gnt = NULL;
flags = GNTMAP_host_map;
- if (!persistent_gnt &&
- (pending_req->operation != BLKIF_OP_READ))
+ if (!use_persistent_gnts && ro)
flags |= GNTMAP_readonly;
gnttab_set_map_op(&map[segs_to_map++], addr,
- flags, req->u.rw.seg[i].gref,
+ flags, pages[i]->gref,
blkif->domid);
}
+ map_until = i + 1;
+ if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
+ break;
}
if (segs_to_map) {
@@ -595,49 +749,133 @@ static int xen_blkbk_map(struct blkif_request *req,
* so that when we access vaddr(pending_req,i) it has the contents of
* the page from the other domain.
*/
- bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
- for (i = 0, j = 0; i < nseg; i++) {
- if (!persistent_gnts[i] ||
- persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
+ for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
+ if (!pages[seg_idx]->persistent_gnt) {
/* This is a newly mapped grant */
- BUG_ON(j >= segs_to_map);
- if (unlikely(map[j].status != 0)) {
+ BUG_ON(new_map_idx >= segs_to_map);
+ if (unlikely(map[new_map_idx].status != 0)) {
pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
- map[j].handle = BLKBACK_INVALID_HANDLE;
+ pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
ret |= 1;
- if (persistent_gnts[i]) {
- rb_erase(&persistent_gnts[i]->node,
- &blkif->persistent_gnts);
- blkif->persistent_gnt_c--;
- kfree(persistent_gnts[i]);
- persistent_gnts[i] = NULL;
- }
+ goto next;
}
+ pages[seg_idx]->handle = map[new_map_idx].handle;
+ } else {
+ continue;
}
- if (persistent_gnts[i]) {
- if (persistent_gnts[i]->handle ==
- BLKBACK_INVALID_HANDLE) {
+ if (use_persistent_gnts &&
+ blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
+ /*
+ * We are using persistent grants, the grant is
+ * not mapped but we might have room for it.
+ */
+ persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
+ GFP_KERNEL);
+ if (!persistent_gnt) {
/*
- * If this is a new persistent grant
- * save the handler
+ * If we don't have enough memory to
+ * allocate the persistent_gnt struct
+ * map this grant non-persistenly
*/
- persistent_gnts[i]->handle = map[j++].handle;
+ goto next;
}
- pending_handle(pending_req, i) =
- persistent_gnts[i]->handle;
+ persistent_gnt->gnt = map[new_map_idx].ref;
+ persistent_gnt->handle = map[new_map_idx].handle;
+ persistent_gnt->page = pages[seg_idx]->page;
+ if (add_persistent_gnt(blkif,
+ persistent_gnt)) {
+ kfree(persistent_gnt);
+ persistent_gnt = NULL;
+ goto next;
+ }
+ pages[seg_idx]->persistent_gnt = persistent_gnt;
+ pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
+ persistent_gnt->gnt, blkif->persistent_gnt_c,
+ xen_blkif_max_pgrants);
+ goto next;
+ }
+ if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
+ blkif->vbd.overflow_max_grants = 1;
+ pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
+ blkif->domid, blkif->vbd.handle);
+ }
+ /*
+ * We could not map this grant persistently, so use it as
+ * a non-persistent grant.
+ */
+next:
+ new_map_idx++;
+ }
+ segs_to_map = 0;
+ last_map = map_until;
+ if (map_until != num)
+ goto again;
- if (ret)
- continue;
- } else {
- pending_handle(pending_req, i) = map[j++].handle;
- bitmap_set(pending_req->unmap_seg, i, 1);
+ return ret;
+
+out_of_memory:
+ pr_alert(DRV_PFX "%s: out of memory\n", __func__);
+ put_free_pages(blkif, pages_to_gnt, segs_to_map);
+ return -ENOMEM;
+}
+
+static int xen_blkbk_map_seg(struct pending_req *pending_req)
+{
+ int rc;
+
+ rc = xen_blkbk_map(pending_req->blkif, pending_req->segments,
+ pending_req->nr_pages,
+ (pending_req->operation != BLKIF_OP_READ));
+
+ return rc;
+}
- if (ret)
- continue;
+static int xen_blkbk_parse_indirect(struct blkif_request *req,
+ struct pending_req *pending_req,
+ struct seg_buf seg[],
+ struct phys_req *preq)
+{
+ struct grant_page **pages = pending_req->indirect_pages;
+ struct xen_blkif *blkif = pending_req->blkif;
+ int indirect_grefs, rc, n, nseg, i;
+ struct blkif_request_segment_aligned *segments = NULL;
+
+ nseg = pending_req->nr_pages;
+ indirect_grefs = INDIRECT_PAGES(nseg);
+ BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
+
+ for (i = 0; i < indirect_grefs; i++)
+ pages[i]->gref = req->u.indirect.indirect_grefs[i];
+
+ rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
+ if (rc)
+ goto unmap;
+
+ for (n = 0, i = 0; n < nseg; n++) {
+ if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
+ /* Map indirect segments */
+ if (segments)
+ kunmap_atomic(segments);
+ segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
+ }
+ i = n % SEGS_PER_INDIRECT_FRAME;
+ pending_req->segments[n]->gref = segments[i].gref;
+ seg[n].nsec = segments[i].last_sect -
+ segments[i].first_sect + 1;
+ seg[n].offset = (segments[i].first_sect << 9);
+ if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
+ (segments[i].last_sect < segments[i].first_sect)) {
+ rc = -EINVAL;
+ goto unmap;
}
- seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
+ preq->nr_sects += seg[n].nsec;
}
- return ret;
+
+unmap:
+ if (segments)
+ kunmap_atomic(segments);
+ xen_blkbk_unmap(blkif, pages, indirect_grefs);
+ return rc;
}
static int dispatch_discard_io(struct xen_blkif *blkif,
@@ -647,7 +885,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
int status = BLKIF_RSP_OKAY;
struct block_device *bdev = blkif->vbd.bdev;
unsigned long secure;
+ struct phys_req preq;
+
+ preq.sector_number = req->u.discard.sector_number;
+ preq.nr_sects = req->u.discard.nr_sectors;
+ err = xen_vbd_translate(&preq, blkif, WRITE);
+ if (err) {
+ pr_warn(DRV_PFX "access denied: DISCARD [%llu->%llu] on dev=%04x\n",
+ preq.sector_number,
+ preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
+ goto fail_response;
+ }
blkif->st_ds_req++;
xen_blkif_get(blkif);
@@ -658,7 +907,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
req->u.discard.nr_sectors,
GFP_KERNEL, secure);
-
+fail_response:
if (err == -EOPNOTSUPP) {
pr_debug(DRV_PFX "discard op failed, not supported\n");
status = BLKIF_RSP_EOPNOTSUPP;
@@ -674,7 +923,7 @@ static int dispatch_other_io(struct xen_blkif *blkif,
struct blkif_request *req,
struct pending_req *pending_req)
{
- free_req(pending_req);
+ free_req(blkif, pending_req);
make_response(blkif, req->u.other.id, req->operation,
BLKIF_RSP_EOPNOTSUPP);
return -EIO;
@@ -726,7 +975,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
* the proper response on the ring.
*/
if (atomic_dec_and_test(&pending_req->pendcnt)) {
- xen_blkbk_unmap(pending_req);
+ xen_blkbk_unmap(pending_req->blkif,
+ pending_req->segments,
+ pending_req->nr_pages);
make_response(pending_req->blkif, pending_req->id,
pending_req->operation, pending_req->status);
xen_blkif_put(pending_req->blkif);
@@ -734,7 +985,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
if (atomic_read(&pending_req->blkif->drain))
complete(&pending_req->blkif->drain_complete);
}
- free_req(pending_req);
+ free_req(pending_req->blkif, pending_req);
}
}
@@ -767,6 +1018,12 @@ __do_block_io_op(struct xen_blkif *blkif)
rp = blk_rings->common.sring->req_prod;
rmb(); /* Ensure we see queued requests up to 'rp'. */
+ if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
+ rc = blk_rings->common.rsp_prod_pvt;
+ pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
+ rp, rc, rp - rc, blkif->vbd.pdevice);
+ return -EACCES;
+ }
while (rc != rp) {
if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
@@ -777,7 +1034,7 @@ __do_block_io_op(struct xen_blkif *blkif)
break;
}
- pending_req = alloc_req();
+ pending_req = alloc_req(blkif);
if (NULL == pending_req) {
blkif->st_oo_req++;
more_to_do = 1;
@@ -807,11 +1064,12 @@ __do_block_io_op(struct xen_blkif *blkif)
case BLKIF_OP_WRITE:
case BLKIF_OP_WRITE_BARRIER:
case BLKIF_OP_FLUSH_DISKCACHE:
+ case BLKIF_OP_INDIRECT:
if (dispatch_rw_block_io(blkif, &req, pending_req))
goto done;
break;
case BLKIF_OP_DISCARD:
- free_req(pending_req);
+ free_req(blkif, pending_req);
if (dispatch_discard_io(blkif, &req))
goto done;
break;
@@ -853,17 +1111,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
struct pending_req *pending_req)
{
struct phys_req preq;
- struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct seg_buf *seg = pending_req->seg;
unsigned int nseg;
struct bio *bio = NULL;
- struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct bio **biolist = pending_req->biolist;
int i, nbio = 0;
int operation;
struct blk_plug plug;
bool drain = false;
- struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct grant_page **pages = pending_req->segments;
+ unsigned short req_operation;
+
+ req_operation = req->operation == BLKIF_OP_INDIRECT ?
+ req->u.indirect.indirect_op : req->operation;
+ if ((req->operation == BLKIF_OP_INDIRECT) &&
+ (req_operation != BLKIF_OP_READ) &&
+ (req_operation != BLKIF_OP_WRITE)) {
+ pr_debug(DRV_PFX "Invalid indirect operation (%u)\n",
+ req_operation);
+ goto fail_response;
+ }
- switch (req->operation) {
+ switch (req_operation) {
case BLKIF_OP_READ:
blkif->st_rd_req++;
operation = READ;
@@ -885,33 +1154,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
}
/* Check that the number of segments is sane. */
- nseg = req->u.rw.nr_segments;
+ nseg = req->operation == BLKIF_OP_INDIRECT ?
+ req->u.indirect.nr_segments : req->u.rw.nr_segments;
if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
- unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+ unlikely((req->operation != BLKIF_OP_INDIRECT) &&
+ (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
+ unlikely((req->operation == BLKIF_OP_INDIRECT) &&
+ (nseg > MAX_INDIRECT_SEGMENTS))) {
pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
nseg);
/* Haven't submitted any bio's yet. */
goto fail_response;
}
- preq.sector_number = req->u.rw.sector_number;
preq.nr_sects = 0;
pending_req->blkif = blkif;
pending_req->id = req->u.rw.id;
- pending_req->operation = req->operation;
+ pending_req->operation = req_operation;
pending_req->status = BLKIF_RSP_OKAY;
pending_req->nr_pages = nseg;
- for (i = 0; i < nseg; i++) {
- seg[i].nsec = req->u.rw.seg[i].last_sect -
- req->u.rw.seg[i].first_sect + 1;
- if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
- (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
+ if (req->operation != BLKIF_OP_INDIRECT) {
+ preq.dev = req->u.rw.handle;
+ preq.sector_number = req->u.rw.sector_number;
+ for (i = 0; i < nseg; i++) {
+ pages[i]->gref = req->u.rw.seg[i].gref;
+ seg[i].nsec = req->u.rw.seg[i].last_sect -
+ req->u.rw.seg[i].first_sect + 1;
+ seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
+ if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+ (req->u.rw.seg[i].last_sect <
+ req->u.rw.seg[i].first_sect))
+ goto fail_response;
+ preq.nr_sects += seg[i].nsec;
+ }
+ } else {
+ preq.dev = req->u.indirect.handle;
+ preq.sector_number = req->u.indirect.sector_number;
+ if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
goto fail_response;
- preq.nr_sects += seg[i].nsec;
-
}
if (xen_vbd_translate(&preq, blkif, operation) != 0) {
@@ -948,7 +1231,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
* the hypercall to unmap the grants - that is all done in
* xen_blkbk_unmap.
*/
- if (xen_blkbk_map(req, pending_req, seg, pages))
+ if (xen_blkbk_map_seg(pending_req))
goto fail_flush;
/*
@@ -960,11 +1243,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
for (i = 0; i < nseg; i++) {
while ((bio == NULL) ||
(bio_add_page(bio,
- pages[i],
+ pages[i]->page,
seg[i].nsec << 9,
seg[i].offset) == 0)) {
- bio = bio_alloc(GFP_KERNEL, nseg-i);
+ int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
+ bio = bio_alloc(GFP_KERNEL, nr_iovecs);
if (unlikely(bio == NULL))
goto fail_put_bio;
@@ -1009,11 +1293,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
return 0;
fail_flush:
- xen_blkbk_unmap(pending_req);
+ xen_blkbk_unmap(blkif, pending_req->segments,
+ pending_req->nr_pages);
fail_response:
/* Haven't submitted any bio's yet. */
- make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
- free_req(pending_req);
+ make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
+ free_req(blkif, pending_req);
msleep(1); /* back off a bit */
return -EIO;
@@ -1070,73 +1355,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
static int __init xen_blkif_init(void)
{
- int i, mmap_pages;
int rc = 0;
if (!xen_domain())
return -ENODEV;
- blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
- if (!blkbk) {
- pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
- return -ENOMEM;
- }
-
- mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
-
- blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) *
- xen_blkif_reqs, GFP_KERNEL);
- blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
- mmap_pages, GFP_KERNEL);
- blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) *
- mmap_pages, GFP_KERNEL);
-
- if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
- !blkbk->pending_pages) {
- rc = -ENOMEM;
- goto out_of_memory;
- }
-
- for (i = 0; i < mmap_pages; i++) {
- blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
- blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
- if (blkbk->pending_pages[i] == NULL) {
- rc = -ENOMEM;
- goto out_of_memory;
- }
- }
rc = xen_blkif_interface_init();
if (rc)
goto failed_init;
- INIT_LIST_HEAD(&blkbk->pending_free);
- spin_lock_init(&blkbk->pending_free_lock);
- init_waitqueue_head(&blkbk->pending_free_wq);
-
- for (i = 0; i < xen_blkif_reqs; i++)
- list_add_tail(&blkbk->pending_reqs[i].free_list,
- &blkbk->pending_free);
-
rc = xen_blkif_xenbus_init();
if (rc)
goto failed_init;
- return 0;
-
- out_of_memory:
- pr_alert(DRV_PFX "%s: out of memory\n", __func__);
failed_init:
- kfree(blkbk->pending_reqs);
- kfree(blkbk->pending_grant_handles);
- if (blkbk->pending_pages) {
- for (i = 0; i < mmap_pages; i++) {
- if (blkbk->pending_pages[i])
- __free_page(blkbk->pending_pages[i]);
- }
- kfree(blkbk->pending_pages);
- }
- kfree(blkbk);
- blkbk = NULL;
return rc;
}
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 60103e2517ba..8d8807563d99 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -50,6 +50,19 @@
__func__, __LINE__, ##args)
+/*
+ * This is the maximum number of segments that would be allowed in indirect
+ * requests. This value will also be passed to the frontend.
+ */
+#define MAX_INDIRECT_SEGMENTS 256
+
+#define SEGS_PER_INDIRECT_FRAME \
+ (PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
+#define MAX_INDIRECT_PAGES \
+ ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+#define INDIRECT_PAGES(_segs) \
+ ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+
/* Not a real protocol. Used to generate ring structs which contain
* the elements common to all protocols only. This way we get a
* compiler-checkable way to use common struct elements, so we can
@@ -83,12 +96,31 @@ struct blkif_x86_32_request_other {
uint64_t id; /* private guest value, echoed in resp */
} __attribute__((__packed__));
+struct blkif_x86_32_request_indirect {
+ uint8_t indirect_op;
+ uint16_t nr_segments;
+ uint64_t id;
+ blkif_sector_t sector_number;
+ blkif_vdev_t handle;
+ uint16_t _pad1;
+ grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+ /*
+ * The maximum number of indirect segments (and pages) that will
+ * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+ * is also exported to the guest (via xenstore
+ * feature-max-indirect-segments entry), so the frontend knows how
+ * many indirect segments the backend supports.
+ */
+ uint64_t _pad2; /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
struct blkif_x86_32_request {
uint8_t operation; /* BLKIF_OP_??? */
union {
struct blkif_x86_32_request_rw rw;
struct blkif_x86_32_request_discard discard;
struct blkif_x86_32_request_other other;
+ struct blkif_x86_32_request_indirect indirect;
} u;
} __attribute__((__packed__));
@@ -127,12 +159,32 @@ struct blkif_x86_64_request_other {
uint64_t id; /* private guest value, echoed in resp */
} __attribute__((__packed__));
+struct blkif_x86_64_request_indirect {
+ uint8_t indirect_op;
+ uint16_t nr_segments;
+ uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */
+ uint64_t id;
+ blkif_sector_t sector_number;
+ blkif_vdev_t handle;
+ uint16_t _pad2;
+ grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+ /*
+ * The maximum number of indirect segments (and pages) that will
+ * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+ * is also exported to the guest (via xenstore
+ * feature-max-indirect-segments entry), so the frontend knows how
+ * many indirect segments the backend supports.
+ */
+ uint32_t _pad3; /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
struct blkif_x86_64_request {
uint8_t operation; /* BLKIF_OP_??? */
union {
struct blkif_x86_64_request_rw rw;
struct blkif_x86_64_request_discard discard;
struct blkif_x86_64_request_other other;
+ struct blkif_x86_64_request_indirect indirect;
} u;
} __attribute__((__packed__));
@@ -182,12 +234,26 @@ struct xen_vbd {
struct backend_info;
+/* Number of available flags */
+#define PERSISTENT_GNT_FLAGS_SIZE 2
+/* This persistent grant is currently in use */
+#define PERSISTENT_GNT_ACTIVE 0
+/*
+ * This persistent grant has been used, this flag is set when we remove the
+ * PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently.
+ */
+#define PERSISTENT_GNT_WAS_ACTIVE 1
+
+/* Number of requests that we can fit in a ring */
+#define XEN_BLKIF_REQS 32
struct persistent_gnt {
struct page *page;
grant_ref_t gnt;
grant_handle_t handle;
+ DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
struct rb_node node;
+ struct list_head remove_node;
};
struct xen_blkif {
@@ -219,6 +285,23 @@ struct xen_blkif {
/* tree to store persistent grants */
struct rb_root persistent_gnts;
unsigned int persistent_gnt_c;
+ atomic_t persistent_gnt_in_use;
+ unsigned long next_lru;
+
+ /* used by the kworker that offload work from the persistent purge */
+ struct list_head persistent_purge_list;
+ struct work_struct persistent_purge_work;
+
+ /* buffer of free pages to map grant refs */
+ spinlock_t free_pages_lock;
+ int free_pages_num;
+ struct list_head free_pages;
+
+ /* List of all 'pending_req' available */
+ struct list_head pending_free;
+ /* And its spinlock. */
+ spinlock_t pending_free_lock;
+ wait_queue_head_t pending_free_wq;
/* statistics */
unsigned long st_print;
@@ -231,6 +314,41 @@ struct xen_blkif {
unsigned long long st_wr_sect;
wait_queue_head_t waiting_to_free;
+ /* Thread shutdown wait queue. */
+ wait_queue_head_t shutdown_wq;
+};
+
+struct seg_buf {
+ unsigned long offset;
+ unsigned int nsec;
+};
+
+struct grant_page {
+ struct page *page;
+ struct persistent_gnt *persistent_gnt;
+ grant_handle_t handle;
+ grant_ref_t gref;
+};
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+struct pending_req {
+ struct xen_blkif *blkif;
+ u64 id;
+ int nr_pages;
+ atomic_t pendcnt;
+ unsigned short operation;
+ int status;
+ struct list_head free_list;
+ struct grant_page *segments[MAX_INDIRECT_SEGMENTS];
+ /* Indirect descriptors */
+ struct grant_page *indirect_pages[MAX_INDIRECT_PAGES];
+ struct seg_buf seg[MAX_INDIRECT_SEGMENTS];
+ struct bio *biolist[MAX_INDIRECT_SEGMENTS];
};
@@ -257,6 +375,7 @@ int xen_blkif_xenbus_init(void);
irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
int xen_blkif_schedule(void *arg);
+int xen_blkif_purge_persistent(void *arg);
int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
struct backend_info *be, int state);
@@ -268,7 +387,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
static inline void blkif_get_x86_32_req(struct blkif_request *dst,
struct blkif_x86_32_request *src)
{
- int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
dst->operation = src->operation;
switch (src->operation) {
case BLKIF_OP_READ:
@@ -291,6 +410,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
dst->u.discard.sector_number = src->u.discard.sector_number;
dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
break;
+ case BLKIF_OP_INDIRECT:
+ dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+ dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+ dst->u.indirect.handle = src->u.indirect.handle;
+ dst->u.indirect.id = src->u.indirect.id;
+ dst->u.indirect.sector_number = src->u.indirect.sector_number;
+ barrier();
+ j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+ for (i = 0; i < j; i++)
+ dst->u.indirect.indirect_grefs[i] =
+ src->u.indirect.indirect_grefs[i];
+ break;
default:
/*
* Don't know how to translate this op. Only get the
@@ -304,7 +435,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
static inline void blkif_get_x86_64_req(struct blkif_request *dst,
struct blkif_x86_64_request *src)
{
- int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
dst->operation = src->operation;
switch (src->operation) {
case BLKIF_OP_READ:
@@ -327,6 +458,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
dst->u.discard.sector_number = src->u.discard.sector_number;
dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
break;
+ case BLKIF_OP_INDIRECT:
+ dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+ dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+ dst->u.indirect.handle = src->u.indirect.handle;
+ dst->u.indirect.id = src->u.indirect.id;
+ dst->u.indirect.sector_number = src->u.indirect.sector_number;
+ barrier();
+ j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+ for (i = 0; i < j; i++)
+ dst->u.indirect.indirect_grefs[i] =
+ src->u.indirect.indirect_grefs[i];
+ break;
default:
/*
* Don't know how to translate this op. Only get the
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 04608a6502d7..fe5c3cd10c34 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -98,12 +98,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
err = PTR_ERR(blkif->xenblkd);
blkif->xenblkd = NULL;
xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
+ return;
}
}
static struct xen_blkif *xen_blkif_alloc(domid_t domid)
{
struct xen_blkif *blkif;
+ struct pending_req *req, *n;
+ int i, j;
+
+ BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
if (!blkif)
@@ -118,8 +123,57 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
blkif->st_print = jiffies;
init_waitqueue_head(&blkif->waiting_to_free);
blkif->persistent_gnts.rb_node = NULL;
+ spin_lock_init(&blkif->free_pages_lock);
+ INIT_LIST_HEAD(&blkif->free_pages);
+ blkif->free_pages_num = 0;
+ atomic_set(&blkif->persistent_gnt_in_use, 0);
+
+ INIT_LIST_HEAD(&blkif->pending_free);
+
+ for (i = 0; i < XEN_BLKIF_REQS; i++) {
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ goto fail;
+ list_add_tail(&req->free_list,
+ &blkif->pending_free);
+ for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+ req->segments[j] = kzalloc(sizeof(*req->segments[0]),
+ GFP_KERNEL);
+ if (!req->segments[j])
+ goto fail;
+ }
+ for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+ req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
+ GFP_KERNEL);
+ if (!req->indirect_pages[j])
+ goto fail;
+ }
+ }
+ spin_lock_init(&blkif->pending_free_lock);
+ init_waitqueue_head(&blkif->pending_free_wq);
+ init_waitqueue_head(&blkif->shutdown_wq);
return blkif;
+
+fail:
+ list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
+ list_del(&req->free_list);
+ for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+ if (!req->segments[j])
+ break;
+ kfree(req->segments[j]);
+ }
+ for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+ if (!req->indirect_pages[j])
+ break;
+ kfree(req->indirect_pages[j]);
+ }
+ kfree(req);
+ }
+
+ kmem_cache_free(xen_blkif_cachep, blkif);
+
+ return ERR_PTR(-ENOMEM);
}
static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
@@ -178,6 +232,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
{
if (blkif->xenblkd) {
kthread_stop(blkif->xenblkd);
+ wake_up(&blkif->shutdown_wq);
blkif->xenblkd = NULL;
}
@@ -198,8 +253,28 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
static void xen_blkif_free(struct xen_blkif *blkif)
{
+ struct pending_req *req, *n;
+ int i = 0, j;
+
if (!atomic_dec_and_test(&blkif->refcnt))
BUG();
+
+ /* Check that there is no request in use */
+ list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
+ list_del(&req->free_list);
+
+ for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
+ kfree(req->segments[j]);
+
+ for (j = 0; j < MAX_INDIRECT_PAGES; j++)
+ kfree(req->indirect_pages[j]);
+
+ kfree(req);
+ i++;
+ }
+
+ WARN_ON(i != XEN_BLKIF_REQS);
+
kmem_cache_free(xen_blkif_cachep, blkif);
}
@@ -678,6 +753,11 @@ again:
dev->nodename);
goto abort;
}
+ err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u",
+ MAX_INDIRECT_SEGMENTS);
+ if (err)
+ dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)",
+ dev->nodename, err);
err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
(unsigned long long)vbd_sz(&be->blkif->vbd));
@@ -704,6 +784,11 @@ again:
dev->nodename);
goto abort;
}
+ err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u",
+ bdev_physical_block_size(be->blkif->vbd.bdev));
+ if (err)
+ xenbus_dev_error(dev, err, "writing %s/physical-sector-size",
+ dev->nodename);
err = xenbus_transaction_end(xbt, 0);
if (err == -EAGAIN)