diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 16 | ||||
-rw-r--r-- | block/Makefile | 1 | ||||
-rw-r--r-- | block/blk-barrier.c | 2 | ||||
-rw-r--r-- | block/blk-core.c | 196 | ||||
-rw-r--r-- | block/blk-integrity.c | 25 | ||||
-rw-r--r-- | block/blk-merge.c | 95 | ||||
-rw-r--r-- | block/blk-settings.c | 2 | ||||
-rw-r--r-- | block/blk-softirq.c | 2 | ||||
-rw-r--r-- | block/blk-sysfs.c | 98 | ||||
-rw-r--r-- | block/blk-timeout.c | 9 | ||||
-rw-r--r-- | block/blk.h | 10 | ||||
-rw-r--r-- | block/blktrace.c | 890 | ||||
-rw-r--r-- | block/bsg.c | 27 | ||||
-rw-r--r-- | block/cfq-iosched.c | 43 | ||||
-rw-r--r-- | block/cmd-filter.c | 1 | ||||
-rw-r--r-- | block/elevator.c | 2 | ||||
-rw-r--r-- | block/genhd.c | 24 | ||||
-rw-r--r-- | block/scsi_ioctl.c | 21 |
18 files changed, 365 insertions, 1099 deletions
diff --git a/block/Kconfig b/block/Kconfig index 0cbb3b88b59a..e7d12782bcfb 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -44,22 +44,6 @@ config LBD If unsure, say N. -config BLK_DEV_IO_TRACE - bool "Support for tracing block io actions" - depends on SYSFS - select RELAY - select DEBUG_FS - select TRACEPOINTS - help - Say Y here if you want to be able to trace the block layer actions - on a given queue. Tracing allows you to see any traffic happening - on a block device queue. For more information (and the userspace - support tools needed), fetch the blktrace tools from: - - git://git.kernel.dk/blktrace.git - - If unsure, say N. - config BLK_DEV_BSG bool "Block layer SG support v4 (EXPERIMENTAL)" depends on EXPERIMENTAL diff --git a/block/Makefile b/block/Makefile index bfe73049f939..e9fa4dd690f2 100644 --- a/block/Makefile +++ b/block/Makefile @@ -13,6 +13,5 @@ obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 8eba4e43bb0c..f7dae57e6cab 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -302,7 +302,7 @@ static void bio_end_empty_barrier(struct bio *bio, int err) * Description: * Issue a flush for the block device in question. Caller can supply * room for storing the error offset in case of a flush error, if they - * wish to. Caller must run wait_for_completion() on its own. + * wish to. */ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) { diff --git a/block/blk-core.c b/block/blk-core.c index a824e49c0d0a..25572802dac2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,11 +64,12 @@ static struct workqueue_struct *kblockd_workqueue; static void drive_stat_acct(struct request *rq, int new_io) { + struct gendisk *disk = rq->rq_disk; struct hd_struct *part; int rw = rq_data_dir(rq); int cpu; - if (!blk_fs_request(rq) || !rq->rq_disk) + if (!blk_fs_request(rq) || !disk || !blk_do_io_stat(disk->queue)) return; cpu = part_stat_lock(); @@ -483,11 +484,11 @@ static int blk_init_free_list(struct request_queue *q) { struct request_list *rl = &q->rq; - rl->count[READ] = rl->count[WRITE] = 0; - rl->starved[READ] = rl->starved[WRITE] = 0; + rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; + rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; rl->elvpriv = 0; - init_waitqueue_head(&rl->wait[READ]); - init_waitqueue_head(&rl->wait[WRITE]); + init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); + init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep, q->node); @@ -599,17 +600,13 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) q->request_fn = rfn; q->prep_rq_fn = NULL; q->unplug_fn = generic_unplug_device; - q->queue_flags = (1 << QUEUE_FLAG_CLUSTER | - 1 << QUEUE_FLAG_STACKABLE); + q->queue_flags = QUEUE_FLAG_DEFAULT; q->queue_lock = lock; - blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); - + /* + * This also sets hw/phys segments, boundary and size + */ blk_queue_make_request(q, __make_request); - blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); - - blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); - blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); q->sg_reserved_size = INT_MAX; @@ -702,18 +699,18 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) ioc->last_waited = jiffies; } -static void __freed_request(struct request_queue *q, int rw) +static void __freed_request(struct request_queue *q, int sync) { struct request_list *rl = &q->rq; - if (rl->count[rw] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, rw); + if (rl->count[sync] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, sync); - if (rl->count[rw] + 1 <= q->nr_requests) { - if (waitqueue_active(&rl->wait[rw])) - wake_up(&rl->wait[rw]); + if (rl->count[sync] + 1 <= q->nr_requests) { + if (waitqueue_active(&rl->wait[sync])) + wake_up(&rl->wait[sync]); - blk_clear_queue_full(q, rw); + blk_clear_queue_full(q, sync); } } @@ -721,21 +718,20 @@ static void __freed_request(struct request_queue *q, int rw) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_queue *q, int rw, int priv) +static void freed_request(struct request_queue *q, int sync, int priv) { struct request_list *rl = &q->rq; - rl->count[rw]--; + rl->count[sync]--; if (priv) rl->elvpriv--; - __freed_request(q, rw); + __freed_request(q, sync); - if (unlikely(rl->starved[rw ^ 1])) - __freed_request(q, rw ^ 1); + if (unlikely(rl->starved[sync ^ 1])) + __freed_request(q, sync ^ 1); } -#define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) /* * Get a free request, queue_lock must be held. * Returns NULL on failure, with queue_lock held. @@ -747,15 +743,15 @@ static struct request *get_request(struct request_queue *q, int rw_flags, struct request *rq = NULL; struct request_list *rl = &q->rq; struct io_context *ioc = NULL; - const int rw = rw_flags & 0x01; + const bool is_sync = rw_is_sync(rw_flags) != 0; int may_queue, priv; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; - if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { - if (rl->count[rw]+1 >= q->nr_requests) { + if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { + if (rl->count[is_sync]+1 >= q->nr_requests) { ioc = current_io_context(GFP_ATOMIC, q->node); /* * The queue will fill after this allocation, so set @@ -763,9 +759,9 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * This process will be allowed to complete a batch of * requests, others will be blocked. */ - if (!blk_queue_full(q, rw)) { + if (!blk_queue_full(q, is_sync)) { ioc_set_batching(q, ioc); - blk_set_queue_full(q, rw); + blk_set_queue_full(q, is_sync); } else { if (may_queue != ELV_MQUEUE_MUST && !ioc_batching(q, ioc)) { @@ -778,7 +774,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, } } } - blk_set_queue_congested(q, rw); + blk_set_queue_congested(q, is_sync); } /* @@ -786,11 +782,11 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * limit of requests, otherwise we could have thousands of requests * allocated with any setting of ->nr_requests */ - if (rl->count[rw] >= (3 * q->nr_requests / 2)) + if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) goto out; - rl->count[rw]++; - rl->starved[rw] = 0; + rl->count[is_sync]++; + rl->starved[is_sync] = 0; priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); if (priv) @@ -808,7 +804,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * wait queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); - freed_request(q, rw, priv); + freed_request(q, is_sync, priv); /* * in the very unlikely event that allocation failed and no @@ -818,8 +814,8 @@ static struct request *get_request(struct request_queue *q, int rw_flags, * rq mempool into READ and WRITE */ rq_starved: - if (unlikely(rl->count[rw] == 0)) - rl->starved[rw] = 1; + if (unlikely(rl->count[is_sync] == 0)) + rl->starved[is_sync] = 1; goto out; } @@ -833,7 +829,7 @@ rq_starved: if (ioc_batching(q, ioc)) ioc->nr_batch_requests--; - trace_block_getrq(q, bio, rw); + trace_block_getrq(q, bio, rw_flags & 1); out: return rq; } @@ -847,7 +843,7 @@ out: static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct bio *bio) { - const int rw = rw_flags & 0x01; + const bool is_sync = rw_is_sync(rw_flags) != 0; struct request *rq; rq = get_request(q, rw_flags, bio, GFP_NOIO); @@ -856,10 +852,10 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct io_context *ioc; struct request_list *rl = &q->rq; - prepare_to_wait_exclusive(&rl->wait[rw], &wait, + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, TASK_UNINTERRUPTIBLE); - trace_block_sleeprq(q, bio, rw); + trace_block_sleeprq(q, bio, rw_flags & 1); __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); @@ -875,7 +871,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, ioc_set_batching(q, ioc); spin_lock_irq(q->queue_lock); - finish_wait(&rl->wait[rw], &wait); + finish_wait(&rl->wait[is_sync], &wait); rq = get_request(q, rw_flags, bio, GFP_NOIO); }; @@ -1066,19 +1062,22 @@ void __blk_put_request(struct request_queue *q, struct request *req) elv_completed_request(q, req); + /* this is a bio leak */ + WARN_ON(req->bio != NULL); + /* * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools */ if (req->cmd_flags & REQ_ALLOCED) { - int rw = rq_data_dir(req); + int is_sync = rq_is_sync(req) != 0; int priv = req->cmd_flags & REQ_ELVPRIV; BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); blk_free_request(q, req); - freed_request(q, rw, priv); + freed_request(q, is_sync, priv); } } EXPORT_SYMBOL_GPL(__blk_put_request); @@ -1125,8 +1124,12 @@ void init_request_from_bio(struct request *req, struct bio *bio) if (bio_sync(bio)) req->cmd_flags |= REQ_RW_SYNC; + if (bio_unplug(bio)) + req->cmd_flags |= REQ_UNPLUG; if (bio_rw_meta(bio)) req->cmd_flags |= REQ_RW_META; + if (bio_noidle(bio)) + req->cmd_flags |= REQ_NOIDLE; req->errors = 0; req->hard_sector = req->sector = bio->bi_sector; @@ -1135,12 +1138,22 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } +/* + * Only disabling plugging for non-rotational devices if it does tagging + * as well, otherwise we do need the proper merging + */ +static inline bool queue_should_plug(struct request_queue *q) +{ + return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); +} + static int __make_request(struct request_queue *q, struct bio *bio) { struct request *req; int el_ret, nr_sectors; const unsigned short prio = bio_prio(bio); const int sync = bio_sync(bio); + const int unplug = bio_unplug(bio); int rw_flags; nr_sectors = bio_sectors(bio); @@ -1240,11 +1253,11 @@ get_rq: if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || bio_flagged(bio, BIO_CPU_AFFINE)) req->cpu = blk_cpu_to_group(smp_processor_id()); - if (!blk_queue_nonrot(q) && elv_queue_empty(q)) + if (queue_should_plug(q) && elv_queue_empty(q)) blk_plug_device(q); add_request(q, req); out: - if (sync || blk_queue_nonrot(q)) + if (unplug || !queue_should_plug(q)) __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); return 0; @@ -1448,6 +1461,11 @@ static inline void __generic_make_request(struct bio *bio) err = -EOPNOTSUPP; goto end_io; } + if (bio_barrier(bio) && bio_has_data(bio) && + (q->next_ordered == QUEUE_ORDERED_NONE)) { + err = -EOPNOTSUPP; + goto end_io; + } ret = q->make_request_fn(q, bio); } while (ret); @@ -1655,6 +1673,55 @@ void blkdev_dequeue_request(struct request *req) } EXPORT_SYMBOL(blkdev_dequeue_request); +static void blk_account_io_completion(struct request *req, unsigned int bytes) +{ + struct gendisk *disk = req->rq_disk; + + if (!disk || !blk_do_io_stat(disk->queue)) + return; + + if (blk_fs_request(req)) { + const int rw = rq_data_dir(req); + struct hd_struct *part; + int cpu; + + cpu = part_stat_lock(); + part = disk_map_sector_rcu(req->rq_disk, req->sector); + part_stat_add(cpu, part, sectors[rw], bytes >> 9); + part_stat_unlock(); + } +} + +static void blk_account_io_done(struct request *req) +{ + struct gendisk *disk = req->rq_disk; + + if (!disk || !blk_do_io_stat(disk->queue)) + return; + + /* + * Account IO completion. bar_rq isn't accounted as a normal + * IO on queueing nor completion. Accounting the containing + * request is enough. + */ + if (blk_fs_request(req) && req != &req->q->bar_rq) { + unsigned long duration = jiffies - req->start_time; + const int rw = rq_data_dir(req); + struct hd_struct *part; + int cpu; + + cpu = part_stat_lock(); + part = disk_map_sector_rcu(disk, req->sector); + + part_stat_inc(cpu, part, ios[rw]); + part_stat_add(cpu, part, ticks[rw], duration); + part_round_stats(cpu, part); + part_dec_in_flight(part); + + part_stat_unlock(); + } +} + /** * __end_that_request_first - end I/O on a request * @req: the request being processed @@ -1690,16 +1757,7 @@ static int __end_that_request_first(struct request *req, int error, (unsigned long long)req->sector); } - if (blk_fs_request(req) && req->rq_disk) { - const int rw = rq_data_dir(req); - struct hd_struct *part; - int cpu; - - cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, req->sector); - part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9); - part_stat_unlock(); - } + blk_account_io_completion(req, nr_bytes); total_bytes = bio_nbytes = 0; while ((bio = req->bio) != NULL) { @@ -1779,8 +1837,6 @@ static int __end_that_request_first(struct request *req, int error, */ static void end_that_request_last(struct request *req, int error) { - struct gendisk *disk = req->rq_disk; - if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); @@ -1792,27 +1848,7 @@ static void end_that_request_last(struct request *req, int error) blk_delete_timer(req); - /* - * Account IO completion. bar_rq isn't accounted as a normal - * IO on queueing nor completion. Accounting the containing - * request is enough. - */ - if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { - unsigned long duration = jiffies - req->start_time; - const int rw = rq_data_dir(req); - struct hd_struct *part; - int cpu; - - cpu = part_stat_lock(); - part = disk_map_sector_rcu(disk, req->sector); - - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, ticks[rw], duration); - part_round_stats(cpu, part); - part_dec_in_flight(part); - - part_stat_unlock(); - } + blk_account_io_done(req); if (req->end_io) req->end_io(req, error); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 61a8e2f8fdd0..91fa8e06b6a5 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -309,24 +309,24 @@ static struct kobj_type integrity_ktype = { /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware - * @template: integrity profile + * @template: optional integrity profile to register * * Description: When a device needs to advertise itself as being able * to send/receive integrity metadata it must use this function to * register the capability with the block layer. The template is a * blk_integrity struct with values appropriate for the underlying - * hardware. See Documentation/block/data-integrity.txt. + * hardware. If template is NULL the new profile is allocated but + * not filled out. See Documentation/block/data-integrity.txt. */ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) { struct blk_integrity *bi; BUG_ON(disk == NULL); - BUG_ON(template == NULL); if (disk->integrity == NULL) { bi = kmem_cache_alloc(integrity_cachep, - GFP_KERNEL | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); if (!bi) return -1; @@ -346,13 +346,16 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) bi = disk->integrity; /* Use the provided profile as template */ - bi->name = template->name; - bi->generate_fn = template->generate_fn; - bi->verify_fn = template->verify_fn; - bi->tuple_size = template->tuple_size; - bi->set_tag_fn = template->set_tag_fn; - bi->get_tag_fn = template->get_tag_fn; - bi->tag_size = template->tag_size; + if (template != NULL) { + bi->name = template->name; + bi->generate_fn = template->generate_fn; + bi->verify_fn = template->verify_fn; + bi->tuple_size = template->tuple_size; + bi->set_tag_fn = template->set_tag_fn; + bi->get_tag_fn = template->get_tag_fn; + bi->tag_size = template->tag_size; + } else + bi->name = "unsupported"; return 0; } diff --git a/block/blk-merge.c b/block/blk-merge.c index b92f5b0866b0..e39cb24b7679 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -38,72 +38,77 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect) } } -void blk_recalc_rq_segments(struct request *rq) +static unsigned int __blk_recalc_rq_segments(struct request_queue *q, + struct bio *bio) { - int nr_phys_segs; unsigned int phys_size; struct bio_vec *bv, *bvprv = NULL; - int seg_size; - int cluster; - struct req_iterator iter; - int high, highprv = 1; - struct request_queue *q = rq->q; + int cluster, i, high, highprv = 1; + unsigned int seg_size, nr_phys_segs; + struct bio *fbio, *bbio; - if (!rq->bio) - return; + if (!bio) + return 0; + fbio = bio; cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); seg_size = 0; phys_size = nr_phys_segs = 0; - rq_for_each_segment(bv, rq, iter) { - /* - * the trick here is making sure that a high page is never - * considered part of another segment, since that might - * change with the bounce page. - */ - high = page_to_pfn(bv->bv_page) > q->bounce_pfn; - if (high || highprv) - goto new_segment; - if (cluster) { - if (seg_size + bv->bv_len > q->max_segment_size) - goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) - goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) + for_each_bio(bio) { + bio_for_each_segment(bv, bio, i) { + /* + * the trick here is making sure that a high page is + * never considered part of another segment, since that + * might change with the bounce page. + */ + high = page_to_pfn(bv->bv_page) > q->bounce_pfn; + if (high || highprv) goto new_segment; + if (cluster) { + if (seg_size + bv->bv_len > q->max_segment_size) + goto new_segment; + if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) + goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) + goto new_segment; + + seg_size += bv->bv_len; + bvprv = bv; + continue; + } +new_segment: + if (nr_phys_segs == 1 && seg_size > + fbio->bi_seg_front_size) + fbio->bi_seg_front_size = seg_size; - seg_size += bv->bv_len; + nr_phys_segs++; bvprv = bv; - continue; + seg_size = bv->bv_len; + highprv = high; } -new_segment: - if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size) - rq->bio->bi_seg_front_size = seg_size; - - nr_phys_segs++; - bvprv = bv; - seg_size = bv->bv_len; - highprv = high; + bbio = bio; } - if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size) - rq->bio->bi_seg_front_size = seg_size; - if (seg_size > rq->biotail->bi_seg_back_size) - rq->biotail->bi_seg_back_size = seg_size; + if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size) + fbio->bi_seg_front_size = seg_size; + if (seg_size > bbio->bi_seg_back_size) + bbio->bi_seg_back_size = seg_size; + + return nr_phys_segs; +} - rq->nr_phys_segments = nr_phys_segs; +void blk_recalc_rq_segments(struct request *rq) +{ + rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio); } void blk_recount_segments(struct request_queue *q, struct bio *bio) { - struct request rq; struct bio *nxt = bio->bi_next; - rq.q = q; - rq.bio = rq.biotail = bio; + bio->bi_next = NULL; - blk_recalc_rq_segments(&rq); + bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio); bio->bi_next = nxt; - bio->bi_phys_segments = rq.nr_phys_segments; bio->bi_flags |= (1 << BIO_SEG_VALID); } EXPORT_SYMBOL(blk_recount_segments); @@ -398,6 +403,8 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (blk_rq_cpu_valid(next)) req->cpu = next->cpu; + /* owner-ship of bio passed from next to req */ + next->bio = NULL; __blk_put_request(q, next); return 1; } diff --git a/block/blk-settings.c b/block/blk-settings.c index 59fd05d9f1d5..69c42adde52b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -431,7 +431,7 @@ EXPORT_SYMBOL(blk_queue_segment_boundary); * * description: * set required memory and length alignment for direct dma transactions. - * this is used when buiding direct io requests for the queue. + * this is used when building direct io requests for the queue. * **/ void blk_queue_dma_alignment(struct request_queue *q, int mask) diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ce0efc6b26dc..ee9c21602228 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -64,7 +64,7 @@ static int raise_blk_irq(int cpu, struct request *rq) data->info = rq; data->flags = 0; - __smp_call_function_single(cpu, data); + __smp_call_function_single(cpu, data, 0); return 0; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a29cb788e408..3ff9bba3379a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -48,28 +48,28 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) q->nr_requests = nr; blk_queue_congestion_threshold(q); - if (rl->count[READ] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, READ); - else if (rl->count[READ] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, READ); - - if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, WRITE); - else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, WRITE); - - if (rl->count[READ] >= q->nr_requests) { - blk_set_queue_full(q, READ); - } else if (rl->count[READ]+1 <= q->nr_requests) { - blk_clear_queue_full(q, READ); - wake_up(&rl->wait[READ]); + if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_SYNC); + + if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) + blk_set_queue_congested(q, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) + blk_clear_queue_congested(q, BLK_RW_ASYNC); + + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + blk_set_queue_full(q, BLK_RW_SYNC); + } else if (rl->count[BLK_RW_SYNC]+1 <= q->nr_requests) { + blk_clear_queue_full(q, BLK_RW_SYNC); + wake_up(&rl->wait[BLK_RW_SYNC]); } - if (rl->count[WRITE] >= q->nr_requests) { - blk_set_queue_full(q, WRITE); - } else if (rl->count[WRITE]+1 <= q->nr_requests) { - blk_clear_queue_full(q, WRITE); - wake_up(&rl->wait[WRITE]); + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + blk_set_queue_full(q, BLK_RW_ASYNC); + } else if (rl->count[BLK_RW_ASYNC]+1 <= q->nr_requests) { + blk_clear_queue_full(q, BLK_RW_ASYNC); + wake_up(&rl->wait[BLK_RW_ASYNC]); } spin_unlock_irq(q->queue_lock); return ret; @@ -130,6 +130,27 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) return queue_var_show(max_hw_sectors_kb, (page)); } +static ssize_t queue_nonrot_show(struct request_queue *q, char *page) +{ + return queue_var_show(!blk_queue_nonrot(q), page); +} + +static ssize_t queue_nonrot_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long nm; + ssize_t ret = queue_var_store(&nm, page, count); + + spin_lock_irq(q->queue_lock); + if (nm) + queue_flag_clear(QUEUE_FLAG_NONROT, q); + else + queue_flag_set(QUEUE_FLAG_NONROT, q); + spin_unlock_irq(q->queue_lock); + + return ret; +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show(blk_queue_nomerges(q), page); @@ -146,8 +167,8 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, queue_flag_set(QUEUE_FLAG_NOMERGES, q); else queue_flag_clear(QUEUE_FLAG_NOMERGES, q); - spin_unlock_irq(q->queue_lock); + return ret; } @@ -176,6 +197,27 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_iostats_show(struct request_queue *q, char *page) +{ + return queue_var_show(blk_queue_io_stat(q), page); +} + +static ssize_t queue_iostats_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long stats; + ssize_t ret = queue_var_store(&stats, page, count); + + spin_lock_irq(q->queue_lock); + if (stats) + queue_flag_set(QUEUE_FLAG_IO_STAT, q); + else + queue_flag_clear(QUEUE_FLAG_IO_STAT, q); + spin_unlock_irq(q->queue_lock); + + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -210,6 +252,12 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .show = queue_hw_sector_size_show, }; +static struct queue_sysfs_entry queue_nonrot_entry = { + .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, + .show = queue_nonrot_show, + .store = queue_nonrot_store, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR }, .show = queue_nomerges_show, @@ -222,6 +270,12 @@ static struct queue_sysfs_entry queue_rq_affinity_entry = { .store = queue_rq_affinity_store, }; +static struct queue_sysfs_entry queue_iostats_entry = { + .attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR }, + .show = queue_iostats_show, + .store = queue_iostats_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -229,8 +283,10 @@ static struct attribute *default_attrs[] = { &queue_max_sectors_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, + &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, + &queue_iostats_entry.attr, NULL, }; diff --git a/block/blk-timeout.c b/block/blk-timeout.c index a09535377a94..bbbdc4b8ccf2 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -209,12 +209,19 @@ void blk_abort_queue(struct request_queue *q) { unsigned long flags; struct request *rq, *tmp; + LIST_HEAD(list); spin_lock_irqsave(q->queue_lock, flags); elv_abort_queue(q); - list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) + /* + * Splice entries to local list, to avoid deadlocking if entries + * get readded to the timeout list by error handling + */ + list_splice_init(&q->timeout_list, &list); + + list_for_each_entry_safe(rq, tmp, &list, timeout_list) blk_abort_request(rq); spin_unlock_irqrestore(q->queue_lock, flags); diff --git a/block/blk.h b/block/blk.h index 6e1ed40534e9..3ee94358b43d 100644 --- a/block/blk.h +++ b/block/blk.h @@ -102,10 +102,18 @@ static inline int blk_cpu_to_group(int cpu) const struct cpumask *mask = cpu_coregroup_mask(cpu); return cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - return first_cpu(per_cpu(cpu_sibling_map, cpu)); + return cpumask_first(topology_thread_cpumask(cpu)); #else return cpu; #endif } +static inline int blk_do_io_stat(struct request_queue *q) +{ + if (q) + return blk_queue_io_stat(q); + + return 0; +} + #endif diff --git a/block/blktrace.c b/block/blktrace.c deleted file mode 100644 index b0a2cae886db..000000000000 --- a/block/blktrace.c +++ /dev/null @@ -1,890 +0,0 @@ -/* - * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - */ -#include <linux/kernel.h> -#include <linux/blkdev.h> -#include <linux/blktrace_api.h> -#include <linux/percpu.h> -#include <linux/init.h> -#include <linux/mutex.h> -#include <linux/debugfs.h> -#include <linux/time.h> -#include <trace/block.h> -#include <asm/uaccess.h> - -static unsigned int blktrace_seq __read_mostly = 1; - -/* Global reference count of probes */ -static DEFINE_MUTEX(blk_probe_mutex); -static atomic_t blk_probes_ref = ATOMIC_INIT(0); - -static int blk_register_tracepoints(void); -static void blk_unregister_tracepoints(void); - -/* - * Send out a notify message. - */ -static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) -{ - struct blk_io_trace *t; - - t = relay_reserve(bt->rchan, sizeof(*t) + len); - if (t) { - const int cpu = smp_processor_id(); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->time = ktime_to_ns(ktime_get()); - t->device = bt->dev; - t->action = action; - t->pid = pid; - t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); - } -} - -/* - * Send out a notify for this process, if we haven't done so since a trace - * started - */ -static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) -{ - tsk->btrace_seq = blktrace_seq; - trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); -} - -static void trace_note_time(struct blk_trace *bt) -{ - struct timespec now; - unsigned long flags; - u32 words[2]; - - getnstimeofday(&now); - words[0] = now.tv_sec; - words[1] = now.tv_nsec; - - local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); - local_irq_restore(flags); -} - -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) -{ - int n; - va_list args; - unsigned long flags; - char *buf; - - local_irq_save(flags); - buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); - va_start(args, fmt); - n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); - va_end(args); - - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(__trace_note_message); - -static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, - pid_t pid) -{ - if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) - return 1; - if (sector < bt->start_lba || sector > bt->end_lba) - return 1; - if (bt->pid && pid != bt->pid) - return 1; - - return 0; -} - -/* - * Data direction bit lookup - */ -static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; - -/* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ( (rw & (1 << BIO_RW_ ## __name)) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name) ) - -/* - * The worker for the various blk_add_trace*() types. Fills out a - * blk_io_trace structure and places it in a per-cpu subbuffer. - */ -static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, - int rw, u32 what, int error, int pdu_len, void *pdu_data) -{ - struct task_struct *tsk = current; - struct blk_io_trace *t; - unsigned long flags; - unsigned long *sequence; - pid_t pid; - int cpu; - - if (unlikely(bt->trace_state != Blktrace_running)) - return; - - what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNC); - what |= MASK_TC_BIT(rw, AHEAD); - what |= MASK_TC_BIT(rw, META); - what |= MASK_TC_BIT(rw, DISCARD); - - pid = tsk->pid; - if (unlikely(act_log_check(bt, what, sector, pid))) - return; - - /* - * A word about the locking here - we disable interrupts to reserve - * some space in the relay per-cpu buffer, to prevent an irq - * from coming in and stepping on our toes. - */ - local_irq_save(flags); - - if (unlikely(tsk->btrace_seq != blktrace_seq)) - trace_note_tsk(bt, tsk); - - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); - if (t) { - cpu = smp_processor_id(); - sequence = per_cpu_ptr(bt->sequence, cpu); - - t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; - t->sequence = ++(*sequence); - t->time = ktime_to_ns(ktime_get()); - t->sector = sector; - t->bytes = bytes; - t->action = what; - t->pid = pid; - t->device = bt->dev; - t->cpu = cpu; - t->error = error; - t->pdu_len = pdu_len; - - if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); - } - - local_irq_restore(flags); -} - -static struct dentry *blk_tree_root; -static DEFINE_MUTEX(blk_tree_mutex); -static unsigned int root_users; - -static inline void blk_remove_root(void) -{ - if (blk_tree_root) { - debugfs_remove(blk_tree_root); - blk_tree_root = NULL; - } -} - -static void blk_remove_tree(struct dentry *dir) -{ - mutex_lock(&blk_tree_mutex); - debugfs_remove(dir); - if (--root_users == 0) - blk_remove_root(); - mutex_unlock(&blk_tree_mutex); -} - -static struct dentry *blk_create_tree(const char *blk_name) -{ - struct dentry *dir = NULL; - int created = 0; - - mutex_lock(&blk_tree_mutex); - - if (!blk_tree_root) { - blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) - goto err; - created = 1; - } - - dir = debugfs_create_dir(blk_name, blk_tree_root); - if (dir) - root_users++; - else { - /* Delete root only if we created it */ - if (created) - blk_remove_root(); - } - -err: - mutex_unlock(&blk_tree_mutex); - return dir; -} - -static void blk_trace_cleanup(struct blk_trace *bt) -{ - relay_close(bt->rchan); - debugfs_remove(bt->msg_file); - debugfs_remove(bt->dropped_file); - blk_remove_tree(bt->dir); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - kfree(bt); - mutex_lock(&blk_probe_mutex); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - mutex_unlock(&blk_probe_mutex); -} - -int blk_trace_remove(struct request_queue *q) -{ - struct blk_trace *bt; - - bt = xchg(&q->blk_trace, NULL); - if (!bt) - return -EINVAL; - - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) - blk_trace_cleanup(bt); - - return 0; -} -EXPORT_SYMBOL_GPL(blk_trace_remove); - -static int blk_dropped_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - -static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, - size_t count, loff_t *ppos) -{ - struct blk_trace *bt = filp->private_data; - char buf[16]; - - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); - - return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); -} - -static const struct file_operations blk_dropped_fops = { - .owner = THIS_MODULE, - .open = blk_dropped_open, - .read = blk_dropped_read, -}; - -static int blk_msg_open(struct inode *inode, struct file *filp) -{ - filp->private_data = inode->i_private; - - return 0; -} - -static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *msg; - struct blk_trace *bt; - - if (count > BLK_TN_MAX_MSG) - return -EINVAL; - - msg = kmalloc(count, GFP_KERNEL); - if (msg == NULL) - return -ENOMEM; - - if (copy_from_user(msg, buffer, count)) { - kfree(msg); - return -EFAULT; - } - - bt = filp->private_data; - __trace_note_message(bt, "%s", msg); - kfree(msg); - - return count; -} - -static const struct file_operations blk_msg_fops = { - .owner = THIS_MODULE, - .open = blk_msg_open, - .write = blk_msg_write, -}; - -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. - */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - -static int blk_remove_buf_file_callback(struct dentry *dentry) -{ - debugfs_remove(dentry); - return 0; -} - -static struct dentry *blk_create_buf_file_callback(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - return debugfs_create_file(filename, mode, parent, buf, - &relay_file_operations); -} - -static struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, - .create_buf_file = blk_create_buf_file_callback, - .remove_buf_file = blk_remove_buf_file_callback, -}; - -/* - * Setup everything required to start tracing - */ -int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct blk_user_trace_setup *buts) -{ - struct blk_trace *old_bt, *bt = NULL; - struct dentry *dir = NULL; - int ret, i; - - if (!buts->buf_size || !buts->buf_nr) - return -EINVAL; - - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); - buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; - - /* - * some device names have larger paths - convert the slashes - * to underscores for this to work as expected - */ - for (i = 0; i < strlen(buts->name); i++) - if (buts->name[i] == '/') - buts->name[i] = '_'; - - ret = -ENOMEM; - bt = kzalloc(sizeof(*bt), GFP_KERNEL); - if (!bt) - goto err; - - bt->sequence = alloc_percpu(unsigned long); - if (!bt->sequence) - goto err; - - bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); - if (!bt->msg_data) - goto err; - - ret = -ENOENT; - dir = blk_create_tree(buts->name); - if (!dir) - goto err; - - bt->dir = dir; - bt->dev = dev; - atomic_set(&bt->dropped, 0); - - ret = -EIO; - bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); - if (!bt->dropped_file) - goto err; - - bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - if (!bt->msg_file) - goto err; - - bt->rchan = relay_open("trace", dir, buts->buf_size, - buts->buf_nr, &blk_relay_callbacks, bt); - if (!bt->rchan) - goto err; - - bt->act_mask = buts->act_mask; - if (!bt->act_mask) - bt->act_mask = (u16) -1; - - bt->start_lba = buts->start_lba; - bt->end_lba = buts->end_lba; - if (!bt->end_lba) - bt->end_lba = -1ULL; - - bt->pid = buts->pid; - bt->trace_state = Blktrace_setup; - - mutex_lock(&blk_probe_mutex); - if (atomic_add_return(1, &blk_probes_ref) == 1) { - ret = blk_register_tracepoints(); - if (ret) - goto probe_err; - } - mutex_unlock(&blk_probe_mutex); - - ret = -EBUSY; - old_bt = xchg(&q->blk_trace, bt); - if (old_bt) { - (void) xchg(&q->blk_trace, old_bt); - goto err; - } - - return 0; -probe_err: - atomic_dec(&blk_probes_ref); - mutex_unlock(&blk_probe_mutex); -err: - if (dir) - blk_remove_tree(dir); - if (bt) { - if (bt->msg_file) - debugfs_remove(bt->msg_file); - if (bt->dropped_file) - debugfs_remove(bt->dropped_file); - free_percpu(bt->sequence); - free_percpu(bt->msg_data); - if (bt->rchan) - relay_close(bt->rchan); - kfree(bt); - } - return ret; -} - -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - char __user *arg) -{ - struct blk_user_trace_setup buts; - int ret; - - ret = copy_from_user(&buts, arg, sizeof(buts)); - if (ret) - return -EFAULT; - - ret = do_blk_trace_setup(q, name, dev, &buts); - if (ret) - return ret; - - if (copy_to_user(arg, &buts, sizeof(buts))) - return -EFAULT; - - return 0; -} -EXPORT_SYMBOL_GPL(blk_trace_setup); - -int blk_trace_startstop(struct request_queue *q, int start) -{ - struct blk_trace *bt; - int ret; - - if ((bt = q->blk_trace) == NULL) - return -EINVAL; - - /* - * For starting a trace, we can transition from a setup or stopped - * trace. For stopping a trace, the state must be running - */ - ret = -EINVAL; - if (start) { - if (bt->trace_state == Blktrace_setup || - bt->trace_state == Blktrace_stopped) { - blktrace_seq++; - smp_mb(); - bt->trace_state = Blktrace_running; - - trace_note_time(bt); - ret = 0; - } - } else { - if (bt->trace_state == Blktrace_running) { - bt->trace_state = Blktrace_stopped; - relay_flush(bt->rchan); - ret = 0; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(blk_trace_startstop); - -/** - * blk_trace_ioctl: - handle the ioctls associated with tracing - * @bdev: the block device - * @cmd: the ioctl cmd - * @arg: the argument data, if any - * - **/ -int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) -{ - struct request_queue *q; - int ret, start = 0; - char b[BDEVNAME_SIZE]; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - mutex_lock(&bdev->bd_mutex); - - switch (cmd) { - case BLKTRACESETUP: - bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, arg); - break; - case BLKTRACESTART: - start = 1; - case BLKTRACESTOP: - ret = blk_trace_startstop(q, start); - break; - case BLKTRACETEARDOWN: - ret = blk_trace_remove(q); - break; - default: - ret = -ENOTTY; - break; - } - - mutex_unlock(&bdev->bd_mutex); - return ret; -} - -/** - * blk_trace_shutdown: - stop and cleanup trace structures - * @q: the request queue associated with the device - * - **/ -void blk_trace_shutdown(struct request_queue *q) -{ - if (q->blk_trace) { - blk_trace_startstop(q, 0); - blk_trace_remove(q); - } -} - -/* - * blktrace probes - */ - -/** - * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for - * @rq: the source request - * @what: the action - * - * Description: - * Records an action against a request. Will log the bio offset + size. - * - **/ -static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; - - if (likely(!bt)) - return; - - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); - - if (blk_pc_request(rq)) { - what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, - sizeof(rq->cmd), rq->cmd); - } else { - what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - rw, what, rq->errors, 0, NULL); - } -} - -static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_ABORT); -} - -static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_INSERT); -} - -static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); -} - -static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); -} - -static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); -} - -/** - * blk_add_trace_bio - Add a trace for a bio oriented action - * @q: queue the io is for - * @bio: the source bio - * @what: the action - * - * Description: - * Records an action against a bio. Will log the bio offset + size. - * - **/ -static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - !bio_flagged(bio, BIO_UPTODATE), 0, NULL); -} - -static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); -} - -static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); -} - -static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); -} - -static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); -} - -static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) -{ - blk_add_trace_bio(q, bio, BLK_TA_QUEUE); -} - -static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ); - else { - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); - } -} - - -static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); - else { - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL); - } -} - -static void blk_add_trace_plug(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) - __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); -} - -static void blk_add_trace_unplug_io(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, - sizeof(rpdu), &rpdu); - } -} - -static void blk_add_trace_unplug_timer(struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, - sizeof(rpdu), &rpdu); - } -} - -static void blk_add_trace_split(struct request_queue *q, struct bio *bio, - unsigned int pdu) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, - BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), - sizeof(rpdu), &rpdu); - } -} - -/** - * blk_add_trace_remap - Add a trace for a remap operation - * @q: queue the io is for - * @bio: the source bio - * @dev: target device - * @from: source sector - * @to: target sector - * - * Description: - * Device mapper or raid target sometimes need to split a bio because - * it spans a stripe (or similar). Add a trace for that action. - * - **/ -static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, sector_t to) -{ - struct blk_trace *bt = q->blk_trace; - struct blk_io_trace_remap r; - - if (likely(!bt)) - return; - - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); - - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); -} - -/** - * blk_add_driver_data - Add binary message with driver-specific data - * @q: queue the io is for - * @rq: io request - * @data: driver-specific data - * @len: length of driver-specific data - * - * Description: - * Some drivers might want to write driver-specific data per request. - * - **/ -void blk_add_driver_data(struct request_queue *q, - struct request *rq, - void *data, size_t len) -{ - struct blk_trace *bt = q->blk_trace; - - if (likely(!bt)) - return; - - if (blk_pc_request(rq)) - __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, - rq->errors, len, data); - else - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, - 0, BLK_TA_DRV_DATA, rq->errors, len, data); -} -EXPORT_SYMBOL_GPL(blk_add_driver_data); - -static int blk_register_tracepoints(void) -{ - int ret; - - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); - WARN_ON(ret); - ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); - WARN_ON(ret); - ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); - WARN_ON(ret); - ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); - WARN_ON(ret); - ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); - WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); - WARN_ON(ret); - ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); - WARN_ON(ret); - ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - WARN_ON(ret); - ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - WARN_ON(ret); - ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); - WARN_ON(ret); - ret = register_trace_block_getrq(blk_add_trace_getrq); - WARN_ON(ret); - ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); - WARN_ON(ret); - ret = register_trace_block_plug(blk_add_trace_plug); - WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); - WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); - WARN_ON(ret); - ret = register_trace_block_split(blk_add_trace_split); - WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap); - WARN_ON(ret); - return 0; -} - -static void blk_unregister_tracepoints(void) -{ - unregister_trace_block_remap(blk_add_trace_remap); - unregister_trace_block_split(blk_add_trace_split); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); - unregister_trace_block_plug(blk_add_trace_plug); - unregister_trace_block_sleeprq(blk_add_trace_sleeprq); - unregister_trace_block_getrq(blk_add_trace_getrq); - unregister_trace_block_bio_queue(blk_add_trace_bio_queue); - unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - unregister_trace_block_bio_complete(blk_add_trace_bio_complete); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); - unregister_trace_block_rq_complete(blk_add_trace_rq_complete); - unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); - unregister_trace_block_rq_issue(blk_add_trace_rq_issue); - unregister_trace_block_rq_insert(blk_add_trace_rq_insert); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort); - - tracepoint_synchronize_unregister(); -} diff --git a/block/bsg.c b/block/bsg.c index d414bb5607e8..206060e795da 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -218,9 +218,6 @@ bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw) if (hdr->guard != 'Q') return -EINVAL; - if (hdr->dout_xfer_len > (q->max_sectors << 9) || - hdr->din_xfer_len > (q->max_sectors << 9)) - return -EIO; switch (hdr->protocol) { case BSG_PROTOCOL_SCSI: @@ -244,7 +241,8 @@ bsg_validate_sgv4_hdr(struct request_queue *q, struct sg_io_v4 *hdr, int *rw) * map sg_io_v4 to a request. */ static struct request * -bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) +bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, + u8 *sense) { struct request_queue *q = bd->queue; struct request *rq, *next_rq = NULL; @@ -306,6 +304,10 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm) if (ret) goto out; } + + rq->sense = sense; + rq->sense_len = 0; + return rq; out: if (rq->cmd != rq->__cmd) @@ -348,8 +350,7 @@ static void bsg_rq_end_io(struct request *rq, int uptodate) static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, struct bsg_command *bc, struct request *rq) { - rq->sense = bc->sense; - rq->sense_len = 0; + int at_head = (0 == (bc->hdr.flags & BSG_FLAG_Q_AT_TAIL)); /* * add bc command to busy queue and submit rq for io @@ -366,7 +367,7 @@ static void bsg_add_command(struct bsg_device *bd, struct request_queue *q, dprintk("%s: queueing rq %p, bc %p\n", bd->name, rq, bc); rq->end_io_data = bc; - blk_execute_rq_nowait(q, NULL, rq, 1, bsg_rq_end_io); + blk_execute_rq_nowait(q, NULL, rq, at_head, bsg_rq_end_io); } static struct bsg_command *bsg_next_done_cmd(struct bsg_device *bd) @@ -419,7 +420,7 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, { int ret = 0; - dprintk("rq %p bio %p %u\n", rq, bio, rq->errors); + dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors); /* * fill in all the output members */ @@ -635,7 +636,7 @@ static int __bsg_write(struct bsg_device *bd, const char __user *buf, /* * get a request, fill in the blanks, and add to request queue */ - rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm); + rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm, bc->sense); if (IS_ERR(rq)) { ret = PTR_ERR(rq); rq = NULL; @@ -922,18 +923,22 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct request *rq; struct bio *bio, *bidi_bio = NULL; struct sg_io_v4 hdr; + int at_head; + u8 sense[SCSI_SENSE_BUFFERSIZE]; if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE); + rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE, sense); if (IS_ERR(rq)) return PTR_ERR(rq); bio = rq->bio; if (rq->next_rq) bidi_bio = rq->next_rq->bio; - blk_execute_rq(bd->queue, NULL, rq, 0); + + at_head = (0 == (hdr.flags & BSG_FLAG_Q_AT_TAIL)); + blk_execute_rq(bd->queue, NULL, rq, at_head); ret = blk_complete_sgv4_hdr_rq(rq, &hdr, bio, bidi_bio); if (copy_to_user(uarg, &hdr, sizeof(hdr))) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e8525fa72823..9e809345f71a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -84,6 +84,11 @@ struct cfq_data { */ struct cfq_rb_root service_tree; unsigned int busy_queues; + /* + * Used to track any pending rt requests so we can pre-empt current + * non-RT cfqq in service when this value is non-zero. + */ + unsigned int busy_rt_queues; int rq_in_driver; int sync_flight; @@ -562,6 +567,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; + if (cfq_class_rt(cfqq)) + cfqd->busy_rt_queues++; cfq_resort_rr_list(cfqd, cfqq); } @@ -581,6 +588,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; + if (cfq_class_rt(cfqq)) + cfqd->busy_rt_queues--; } /* @@ -1005,6 +1014,20 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) goto expire; /* + * If we have a RT cfqq waiting, then we pre-empt the current non-rt + * cfqq. + */ + if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) { + /* + * We simulate this as cfqq timed out so that it gets to bank + * the remaining of its time slice. + */ + cfq_log_cfqq(cfqd, cfqq, "preempt"); + cfq_slice_expired(cfqd, 1); + goto new_queue; + } + + /* * The active queue has requests and isn't expired, allow it to * dispatch. */ @@ -1067,6 +1090,13 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (RB_EMPTY_ROOT(&cfqq->sort_list)) break; + /* + * If there is a non-empty RT cfqq waiting for current + * cfqq's timeslice to complete, pre-empt this cfqq + */ + if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) + break; + } while (dispatched < max_dispatch); /* @@ -1801,6 +1831,12 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (rq_is_meta(rq) && !cfqq->meta_pending) return 1; + /* + * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. + */ + if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) + return 1; + if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) return 0; @@ -1870,7 +1906,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * not the active queue - expire current slice if it is * idle and has expired it's mean thinktime or this new queue - * has some old slice time left and is of higher priority + * has some old slice time left and is of higher priority or + * this new queue is RT and the current one is BE */ cfq_preempt_queue(cfqd, cfqq); cfq_mark_cfqq_must_dispatch(cfqq); @@ -1955,8 +1992,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) } if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) cfq_slice_expired(cfqd, 1); - else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) + else if (sync && !rq_noidle(rq) && + RB_EMPTY_ROOT(&cfqq->sort_list)) { cfq_arm_slice_timer(cfqd); + } } if (!cfqd->rq_in_driver) diff --git a/block/cmd-filter.c b/block/cmd-filter.c index 504b275e1b90..572bbc2f900d 100644 --- a/block/cmd-filter.c +++ b/block/cmd-filter.c @@ -22,6 +22,7 @@ #include <linux/spinlock.h> #include <linux/capability.h> #include <linux/bitops.h> +#include <linux/blkdev.h> #include <scsi/scsi.h> #include <linux/cdrom.h> diff --git a/block/elevator.c b/block/elevator.c index 98259eda0ef6..ca6788a0195a 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -677,7 +677,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) } if (unplug_it && blk_queue_plugged(q)) { - int nrq = q->rq.count[READ] + q->rq.count[WRITE] + int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] - q->in_flight; if (nrq >= q->unplug_thresh) diff --git a/block/genhd.c b/block/genhd.c index 397960cf26af..a9ec910974c1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -256,6 +256,22 @@ void blkdev_show(struct seq_file *seqf, off_t offset) } #endif /* CONFIG_PROC_FS */ +/** + * register_blkdev - register a new block device + * + * @major: the requested major device number [1..255]. If @major=0, try to + * allocate any unused major number. + * @name: the name of the new block device as a zero terminated string + * + * The @name must be unique within the system. + * + * The return value depends on the @major input parameter. + * - if a major device number was requested in range [1..255] then the + * function returns zero on success, or a negative error code + * - if any unused major number was requested with @major=0 parameter + * then the return value is the allocated major number in range + * [1..255] or a negative error code otherwise + */ int register_blkdev(unsigned int major, const char *name) { struct blk_major_name **n, *p; @@ -1087,6 +1103,14 @@ dev_t blk_lookup_devt(const char *name, int partno) if (strcmp(dev_name(dev), name)) continue; + if (partno < disk->minors) { + /* We need to return the right devno, even + * if the partition doesn't exist yet. + */ + devt = MKDEV(MAJOR(dev->devt), + MINOR(dev->devt) + partno); + break; + } part = disk_get_part(disk, partno); if (part) { devt = part_devt(part); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index ee9c67d7e1be..626ee274c5c4 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -214,21 +214,10 @@ static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, return 0; } -/* - * unmap a request that was previously mapped to this sg_io_hdr. handles - * both sg and non-sg sg_io_hdr. - */ -static int blk_unmap_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr) -{ - blk_rq_unmap_user(rq->bio); - blk_put_request(rq); - return 0; -} - static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, struct bio *bio) { - int r, ret = 0; + int ret = 0; /* * fill in all the output members @@ -253,12 +242,10 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, ret = -EFAULT; } - rq->bio = bio; - r = blk_unmap_sghdr_rq(rq, hdr); - if (ret) - r = ret; + blk_rq_unmap_user(bio); + blk_put_request(rq); - return r; + return ret; } static int sg_io(struct request_queue *q, struct gendisk *bd_disk, |