From 05caf8dbc1880415df3378cfd114d832c9618b60 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Thu, 22 May 2008 15:13:29 +0200 Subject: block: Move the second call to get_request to the end of the loop In function get_request_wait, the second call to get_request could be moved to the end of the while loop, because if the first call to get_request fails, the second call will fail without sleep. Signed-off-by: Zhang Yanmin Signed-off-by: Jens Axboe --- block/blk-core.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 6a9cc0d22a61..1905aaba49fb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -806,35 +806,32 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags, rq = get_request(q, rw_flags, bio, GFP_NOIO); while (!rq) { DEFINE_WAIT(wait); + struct io_context *ioc; struct request_list *rl = &q->rq; prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - rq = get_request(q, rw_flags, bio, GFP_NOIO); - - if (!rq) { - struct io_context *ioc; + blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); - blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); - - __generic_unplug_device(q); - spin_unlock_irq(q->queue_lock); - io_schedule(); + __generic_unplug_device(q); + spin_unlock_irq(q->queue_lock); + io_schedule(); - /* - * After sleeping, we become a "batching" process and - * will be able to allocate at least one request, and - * up to a big batch of them for a small period time. - * See ioc_batching, ioc_set_batching - */ - ioc = current_io_context(GFP_NOIO, q->node); - ioc_set_batching(q, ioc); + /* + * After sleeping, we become a "batching" process and + * will be able to allocate at least one request, and + * up to a big batch of them for a small period time. + * See ioc_batching, ioc_set_batching + */ + ioc = current_io_context(GFP_NOIO, q->node); + ioc_set_batching(q, ioc); - spin_lock_irq(q->queue_lock); - } + spin_lock_irq(q->queue_lock); finish_wait(&rl->wait[rw], &wait); - } + + rq = get_request(q, rw_flags, bio, GFP_NOIO); + }; return rq; } -- cgit v1.2.3 From be754d2c2161c0cce11d62727016985ecb76831b Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Fri, 23 May 2008 06:52:00 +0200 Subject: block: reorder cfq_queue to save space on 64bit builds saves 8 bytes of padding & increases objects/slab from 30 to 32 on my AMD64 config Signed-off-by: Richard Kennedy Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b399c62936e0..4df3f0522435 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -124,6 +124,8 @@ struct cfq_data { struct cfq_queue { /* reference count */ atomic_t ref; + /* various state flags, see below */ + unsigned int flags; /* parent cfq_data */ struct cfq_data *cfqd; /* service_tree member */ @@ -138,14 +140,14 @@ struct cfq_queue { int queued[2]; /* currently allocated requests */ int allocated[2]; - /* pending metadata requests */ - int meta_pending; /* fifo list of requests in sort_list */ struct list_head fifo; unsigned long slice_end; long slice_resid; + /* pending metadata requests */ + int meta_pending; /* number of requests that are on the dispatch list or inside driver */ int dispatched; @@ -153,8 +155,6 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; - /* various state flags, see below */ - unsigned int flags; }; enum cfqq_state_flags { -- cgit v1.2.3 From 9d5f09a424a67ddb959829894efb4c71cbf6d600 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Tue, 27 May 2008 14:54:41 +0200 Subject: Added in MESSAGE notes for blktraces Allows messages to be inserted into blktrace streams. Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- block/blktrace.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'block') diff --git a/block/blktrace.c b/block/blktrace.c index b2cbb4e5d767..20e11f354f11 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -75,6 +75,20 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } +void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +{ + int n; + va_list args; + static char bt_msg_buf[BLK_TN_MAX_MSG]; + + va_start(args, fmt); + n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args); + va_end(args); + + trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n); +} +EXPORT_SYMBOL_GPL(__trace_note_message); + static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, pid_t pid) { -- cgit v1.2.3 From 4722dc52a891ab6cb2d637ddb87233e0ce277827 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Tue, 27 May 2008 14:55:00 +0200 Subject: Added in elevator switch message to blktrace stream Signed-off-by: Alan D. Brunelle Signed-off-by: Jens Axboe --- block/elevator.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 980f8ae147b4..902dd1344d56 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1110,6 +1110,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); spin_unlock_irq(q->queue_lock); + blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); + return 1; fail_register: -- cgit v1.2.3 From 64565911cdb57c2f512a9715b985b5617402cc67 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 May 2008 14:45:33 +0200 Subject: block: make blktrace use per-cpu buffers for message notes Currently it uses a single static char array, but that risks being corrupted when multiple users issue message notes at the same time. Make the buffers dynamically allocated when the trace is setup and make them per-cpu instead. The default max message size of 1k is also very large, the interface is mainly for small text notes. So shrink it to 128 bytes. Signed-off-by: Jens Axboe --- block/blktrace.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blktrace.c b/block/blktrace.c index 20e11f354f11..7ae87cc4a163 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -79,13 +79,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) { int n; va_list args; - static char bt_msg_buf[BLK_TN_MAX_MSG]; + char *buf; + preempt_disable(); + buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); - n = vscnprintf(bt_msg_buf, BLK_TN_MAX_MSG, fmt, args); + n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, bt_msg_buf, n); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + preempt_enable(); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -246,6 +249,7 @@ static void blk_trace_cleanup(struct blk_trace *bt) debugfs_remove(bt->dropped_file); blk_remove_tree(bt->dir); free_percpu(bt->sequence); + free_percpu(bt->msg_data); kfree(bt); } @@ -360,6 +364,10 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->sequence) goto err; + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG); + if (!bt->msg_data) + goto err; + ret = -ENOENT; dir = blk_create_tree(buts->name); if (!dir) @@ -406,6 +414,7 @@ err: if (bt->dropped_file) debugfs_remove(bt->dropped_file); free_percpu(bt->sequence); + free_percpu(bt->msg_data); if (bt->rchan) relay_close(bt->rchan); kfree(bt); -- cgit v1.2.3 From d6de8be711b28049a5cb93c954722c311c7d3f7f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 May 2008 14:46:59 +0200 Subject: cfq-iosched: fix RCU problem in cfq_cic_lookup() cfq_cic_lookup() needs to properly protect ioc->ioc_data before dereferencing it and also exclude updaters of ioc->ioc_data as well. Also add a number of comments documenting why the existing RCU usage is OK. Thanks a lot to "Paul E. McKenney" for review and comments! Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4df3f0522435..d01b411c72f0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1142,6 +1142,9 @@ static void cfq_put_queue(struct cfq_queue *cfqq) kmem_cache_free(cfq_pool, cfqq); } +/* + * Must always be called with the rcu_read_lock() held + */ static void __call_for_each_cic(struct io_context *ioc, void (*func)(struct io_context *, struct cfq_io_context *)) @@ -1197,6 +1200,11 @@ static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) cfq_cic_free(cic); } +/* + * Must be called with rcu_read_lock() held or preemption otherwise disabled. + * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), + * and ->trim() which is called with the task lock held + */ static void cfq_free_io_context(struct io_context *ioc) { /* @@ -1502,20 +1510,24 @@ static struct cfq_io_context * cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) { struct cfq_io_context *cic; + unsigned long flags; void *k; if (unlikely(!ioc)) return NULL; + rcu_read_lock(); + /* * we maintain a last-hit cache, to avoid browsing over the tree */ cic = rcu_dereference(ioc->ioc_data); - if (cic && cic->key == cfqd) + if (cic && cic->key == cfqd) { + rcu_read_unlock(); return cic; + } do { - rcu_read_lock(); cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); rcu_read_unlock(); if (!cic) @@ -1524,10 +1536,13 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) k = cic->key; if (unlikely(!k)) { cfq_drop_dead_cic(cfqd, ioc, cic); + rcu_read_lock(); continue; } + spin_lock_irqsave(&ioc->lock, flags); rcu_assign_pointer(ioc->ioc_data, cic); + spin_unlock_irqrestore(&ioc->lock, flags); break; } while (1); @@ -2134,6 +2149,10 @@ static void *cfq_init_queue(struct request_queue *q) static void cfq_slab_kill(void) { + /* + * Caller already ensured that pending RCU callbacks are completed, + * so we should have no busy allocations at this point. + */ if (cfq_pool) kmem_cache_destroy(cfq_pool); if (cfq_ioc_pool) @@ -2292,6 +2311,11 @@ static void __exit cfq_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); + + /* + * this also protects us from entering cfq_slab_kill() with + * pending RCU callbacks + */ if (elv_ioc_count_read(ioc_count)) wait_for_completion(ioc_gone); cfq_slab_kill(); -- cgit v1.2.3 From d5791d13b1d45542895104edf4b09476d5ad24b0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 9 Jun 2008 10:06:24 -0700 Subject: Fix invalid access errors in blk_lookup_devt Commit 30f2f0eb4bd2c43d10a8b0d872c6e5ad8f31c9a0 ("block: do_mounts - accept root=") extended blk_lookup_devt() to be able to look up partitions that had not yet been registered, but in the process made the assumption that the '&block_class.devices' list only contains disk devices and that you can do 'dev_to_disk(dev)' on them. That isn't actually true. The block_class device list also contains the partitions we've discovered so far, and you can't just do a 'dev_to_disk()' on those. So make sure to only work on devices that block/genhd.c has registered itself, something we can test by checking the 'dev->type' member. This makes the loop in blk_lookup_devt() match the other such loops in this file. [ We may want to do an alternate version that knows to handle _either_ whole-disk devices or partitions, but for now this is the minimal fix for a series of crashes reported by Mariusz Kozlowski in http://lkml.org/lkml/2008/5/25/25 and Ingo in http://lkml.org/lkml/2008/6/9/39 ] Reported-by: Mariusz Kozlowski Reported-by: Ingo Molnar Cc: Neil Brown Cc: Joao Luis Meloni Assirati Acked-by: Kay Sievers Cc: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- block/genhd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 129ad939f9dd..b922d4801c87 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -660,6 +660,8 @@ dev_t blk_lookup_devt(const char *name, int part) mutex_lock(&block_class_lock); list_for_each_entry(dev, &block_class.devices, node) { + if (dev->type != &disk_type) + continue; if (strcmp(dev->bus_id, name) == 0) { struct gendisk *disk = dev_to_disk(dev); -- cgit v1.2.3 From 14a73f54798f39854e521fb596da7d50b7566bbd Mon Sep 17 00:00:00 2001 From: Carl Henrik Lunde Date: Thu, 12 Jun 2008 20:13:58 +0200 Subject: block: disable IRQs until data is written to relay channel As we may run relay_reserve from interrupt context we must always disable IRQs. This is because a call to relay_reserve may expose previously written data to use space. Updated new message code and an old but related comment. Signed-off-by: Carl Henrik Lunde Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- block/blktrace.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blktrace.c b/block/blktrace.c index 7ae87cc4a163..8d3a27780260 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -79,16 +79,17 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) { int n; va_list args; + unsigned long flags; char *buf; - preempt_disable(); + local_irq_save(flags); buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); - preempt_enable(); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -158,10 +159,7 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, /* * A word about the locking here - we disable interrupts to reserve * some space in the relay per-cpu buffer, to prevent an irq - * from coming in and stepping on our toes. Once reserved, it's - * enough to get preemption disabled to prevent read of this data - * before we are through filling it. get_cpu()/put_cpu() does this - * for us + * from coming in and stepping on our toes. */ local_irq_save(flags); -- cgit v1.2.3 From d585d0b9d73ed999cc7b8cf3cac4a5b01abb544e Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Mon, 16 Jun 2008 18:37:08 +0200 Subject: block: Fix the starving writes bug in the anticipatory IO scheduler AS scheduler alternates between issuing read and write batches. It does the batch switch only after all requests from the previous batch are completed. When switching to a write batch, if there is an on-going read request, it waits for its completion and indicates its intention of switching by setting ad->changed_batch and the new direction but does not update the batch_expire_time for the new write batch which it does in the case of no previous pending requests. On completion of the read request, it sees that we were waiting for the switch and schedules work for kblockd right away and resets the ad->changed_data flag. Now when kblockd enters dispatch_request where it is expected to pick up a write request, it in turn ends the write batch because the batch_expire_timer was not updated and shows the expire timestamp for the previous batch. This results in the write starvation for all the cases where there is the intention for switching to a write batch, but there is a previous in-flight read request and the batch gets reverted to a read_batch right away. This also holds true in the reverse case (switching from a write batch to a read batch with an in-flight write request). I've checked that this bug exists on 2.6.11, 2.6.18, 2.6.24 and linux-2.6-block git HEAD. I've tested the fix on x86 platforms with SCSI drives where the driver asks for the next request while a current request is in-flight. This patch is based off linux-2.6-block git HEAD. Bug reproduction: A simple scenario which reproduces this bug is: - dd if=/dev/hda3 of=/dev/null & - lilo The lilo takes forever to complete. This can also be reproduced fairly easily with the earlier dd and another test program doing msync(). The example test program below should print out a message after every iteration but it simply hangs forever. With this bugfix it makes forward progress. ==== Example test program using msync() (thanks to suleiman AT google DOT com) inline uint64_t rdtsc(void) { int64_t tsc; __asm __volatile("rdtsc" : "=A" (tsc)); return (tsc); } int main(int argc, char **argv) { struct stat st; uint64_t e, s, t; char *p, q; long i; int fd; if (argc < 2) { printf("Usage: %s \n", argv[0]); return (1); } if ((fd = open(argv[1], O_RDWR | O_NOATIME)) < 0) err(1, "open"); if (fstat(fd, &st) < 0) err(1, "fstat"); p = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); t = 0; for (i = 0; i < 1000; i++) { *p = 0; msync(p, 4096, MS_SYNC); s = rdtsc(); *p = 0; __asm __volatile(""::: "memory"); e = rdtsc(); if (argc > 2) printf("%d: %lld cycles %jd %jd\n", i, e - s, (intmax_t)s, (intmax_t)e); t += e - s; } printf("average time: %lld cycles\n", t / 1000); return (0); } Cc: Acked-by: Nick Piggin Signed-off-by: Jens Axboe --- block/as-iosched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/as-iosched.c b/block/as-iosched.c index 8c3946787dbb..743f33a01a07 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -831,6 +831,8 @@ static void as_completed_request(struct request_queue *q, struct request *rq) } if (ad->changed_batch && ad->nr_dispatched == 1) { + ad->current_batch_expires = jiffies + + ad->batch_expire[ad->batch_data_dir]; kblockd_schedule_work(&ad->antic_work); ad->changed_batch = 0; -- cgit v1.2.3 From 8df5fc042c8e7c08dc438c8198b62407ee1e91a0 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Mon, 7 Jul 2008 15:50:01 -0500 Subject: [SCSI] bsg: fix oops on remove If you do a modremove of any sas driver, you run into an oops on shutdown when the host is removed (coming from the host bsg device). The root cause seems to be that there's a use after free of the bsg_class_device: In bsg_kref_release_function, this is used (to do a put_device(bcg->parent) after bcg->release has been called. In sas (and possibly many other things) bcd->release frees the queue which contains the bsg_class_device, so we get a put_device on unreferenced memory. Fix this by taking a copy of the pointer to the parent before releasing bsg. Acked-by: FUJITA Tomonori Signed-off-by: James Bottomley --- block/bsg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bsg.c b/block/bsg.c index f0b7cd343216..54d617f7df3e 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -709,11 +709,12 @@ static void bsg_kref_release_function(struct kref *kref) { struct bsg_class_device *bcd = container_of(kref, struct bsg_class_device, ref); + struct device *parent = bcd->parent; if (bcd->release) bcd->release(bcd->parent); - put_device(bcd->parent); + put_device(parent); } static int bsg_put_device(struct bsg_device *bd) -- cgit v1.2.3