From bca4b914b5da3d8e7b9b647f620b71dc85c0c394 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 20 May 2010 23:21:34 +0400 Subject: cfq-iosched: remove dead_key from cfq_io_context Remove ->dead_key field from cfq_io_context to shrink its size to 128 bytes. (64 bytes for 32-bit hosts) Use lower bit in ->key as dead-mark, instead of moving key to separate field. After this for dead cfq_io_context we got cic->key != cfqd automatically. Thus, io_context's last-hit cache should work without changing. Now to check ->key for non-dead state compare it with cfqd, instead of checking ->key for non-null value as it was before. Plus remove obsolete race protection in cfq_cic_lookup. This race gone after v2.6.24-1728-g4ac845a Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 41 ++++++++++++++++++++++++++++------------- include/linux/iocontext.h | 1 - 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ed897b5ef315..407602350350 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -430,6 +430,23 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic, cic->cfqq[is_sync] = cfqq; } +#define CIC_DEAD_KEY 1ul + +static inline void *cfqd_dead_key(struct cfq_data *cfqd) +{ + return (void *)((unsigned long) cfqd | CIC_DEAD_KEY); +} + +static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) +{ + struct cfq_data *cfqd = cic->key; + + if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) + return NULL; + + return cfqd; +} + /* * We regard a request as SYNC, if it's either a read or has the SYNC bit * set (in which case it could also be direct WRITE). @@ -2510,11 +2527,12 @@ static void cfq_cic_free(struct cfq_io_context *cic) static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) { unsigned long flags; + unsigned long dead_key = (unsigned long) cic->key; - BUG_ON(!cic->dead_key); + BUG_ON(!(dead_key & CIC_DEAD_KEY)); spin_lock_irqsave(&ioc->lock, flags); - radix_tree_delete(&ioc->radix_root, cic->dead_key); + radix_tree_delete(&ioc->radix_root, dead_key & ~CIC_DEAD_KEY); hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2573,11 +2591,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, list_del_init(&cic->queue_list); /* - * Make sure key == NULL is seen for dead queues + * Make sure dead mark is seen for dead queues */ smp_wmb(); - cic->dead_key = (unsigned long) cic->key; - cic->key = NULL; + cic->key = cfqd_dead_key(cfqd); if (ioc->ioc_data == cic) rcu_assign_pointer(ioc->ioc_data, NULL); @@ -2596,7 +2613,7 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, static void cfq_exit_single_io_context(struct io_context *ioc, struct cfq_io_context *cic) { - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); if (cfqd) { struct request_queue *q = cfqd->queue; @@ -2609,7 +2626,7 @@ static void cfq_exit_single_io_context(struct io_context *ioc, * race between exiting task and queue */ smp_read_barrier_depends(); - if (cic->key) + if (cic->key == cfqd) __cfq_exit_single_io_context(cfqd, cic); spin_unlock_irqrestore(q->queue_lock, flags); @@ -2689,7 +2706,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) { - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; unsigned long flags; @@ -2746,7 +2763,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) { struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); unsigned long flags; struct request_queue *q; @@ -2883,6 +2900,7 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, unsigned long flags; WARN_ON(!list_empty(&cic->queue_list)); + BUG_ON(cic->key != cfqd_dead_key(cfqd)); spin_lock_irqsave(&ioc->lock, flags); @@ -2900,7 +2918,6 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) { struct cfq_io_context *cic; unsigned long flags; - void *k; if (unlikely(!ioc)) return NULL; @@ -2921,9 +2938,7 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) rcu_read_unlock(); if (!cic) break; - /* ->key must be copied to avoid race with cfq_exit_queue() */ - k = cic->key; - if (unlikely(!k)) { + if (unlikely(cic->key != cfqd)) { cfq_drop_dead_cic(cfqd, ioc, cic); rcu_read_lock(); continue; diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index a0bb301afac0..64d529133031 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -7,7 +7,6 @@ struct cfq_queue; struct cfq_io_context { void *key; - unsigned long dead_key; struct cfq_queue *cfqq[2]; -- cgit v1.2.3 From 80b15c7389caa81a3860f9fc2ee47ec0ea572a63 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 20 May 2010 23:21:41 +0400 Subject: cfq-iosched: compact io_context radix_tree Use small consequent indexes as radix tree keys instead of sparse cfqd address. This change will reduce radix tree depth from 11 (6 for 32-bit hosts) to 1 if host have <=64 disks under cfq control, or to 0 if there only one disk. So, this patch save 10*560 bytes for each process (5*296 for 32-bit hosts) For each cfqd allocate cic index from ida. To unlink dead cic from tree without cfqd access store index into ->key. (bit 0 -- dead mark, bits 1..30 -- index: ida produce id in range 0..2^31-1) Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 407602350350..c72e5acc573d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -64,6 +64,9 @@ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); +static DEFINE_SPINLOCK(cic_index_lock); +static DEFINE_IDA(cic_index_ida); + #define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) @@ -271,6 +274,7 @@ struct cfq_data { unsigned int cfq_latency; unsigned int cfq_group_isolation; + unsigned int cic_index; struct list_head cic_list; /* @@ -431,10 +435,11 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic, } #define CIC_DEAD_KEY 1ul +#define CIC_DEAD_INDEX_SHIFT 1 static inline void *cfqd_dead_key(struct cfq_data *cfqd) { - return (void *)((unsigned long) cfqd | CIC_DEAD_KEY); + return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); } static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) @@ -2532,7 +2537,7 @@ static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) BUG_ON(!(dead_key & CIC_DEAD_KEY)); spin_lock_irqsave(&ioc->lock, flags); - radix_tree_delete(&ioc->radix_root, dead_key & ~CIC_DEAD_KEY); + radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT); hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2906,7 +2911,7 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, BUG_ON(ioc->ioc_data == cic); - radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); + radix_tree_delete(&ioc->radix_root, cfqd->cic_index); hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2934,7 +2939,7 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) } do { - cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); + cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index); rcu_read_unlock(); if (!cic) break; @@ -2971,7 +2976,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, spin_lock_irqsave(&ioc->lock, flags); ret = radix_tree_insert(&ioc->radix_root, - (unsigned long) cfqd, cic); + cfqd->cic_index, cic); if (!ret) hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -3723,10 +3728,32 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_shutdown_timer_wq(cfqd); + spin_lock(&cic_index_lock); + ida_remove(&cic_index_ida, cfqd->cic_index); + spin_unlock(&cic_index_lock); + /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ call_rcu(&cfqd->rcu, cfq_cfqd_free); } +static int cfq_alloc_cic_index(void) +{ + int index, error; + + do { + if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&cic_index_lock); + error = ida_get_new(&cic_index_ida, &index); + spin_unlock(&cic_index_lock); + if (error && error != -EAGAIN) + return error; + } while (error); + + return index; +} + static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; @@ -3734,10 +3761,16 @@ static void *cfq_init_queue(struct request_queue *q) struct cfq_group *cfqg; struct cfq_rb_root *st; + i = cfq_alloc_cic_index(); + if (i < 0) + return NULL; + cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; + cfqd->cic_index = i; + /* Init root service tree */ cfqd->grp_service_tree = CFQ_RB_ROOT; @@ -3999,6 +4032,7 @@ static void __exit cfq_exit(void) */ if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); + ida_destroy(&cic_index_ida); cfq_slab_kill(); } -- cgit v1.2.3 From e36f724b4ae70e443a7d152929b60059cbfa1a26 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 24 May 2010 09:07:32 +0200 Subject: block: Adjust elv_iosched_show to return "none" for bio-based DM Bio-based DM doesn't use an elevator (queue is !blk_queue_stackable()). Longer-term DM will not allocate an elevator for bio-based DM. But even then there will be small potential for an elevator to be allocated for a request-based DM table only to have a bio-based table be loaded in the end. Displaying "none" for bio-based DM will help avoid user confusion. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/elevator.c b/block/elevator.c index 6df2b5056b51..0abce473d606 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -1097,7 +1097,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_type *__e; int len = 0; - if (!q->elevator) + if (!q->elevator || !blk_queue_stackable(q)) return sprintf(name, "none\n"); elv = e->elevator_type; -- cgit v1.2.3 From 0191f8697bbdfefcd36e7b8dc3eeddfe82893e4b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 May 2010 19:15:57 +0200 Subject: pipe: F_SETPIPE_SZ should return -EPERM for non-root If the passed in size is larger than what has been set as the system wide limit and the user is not root, we want to return permission denied (not invalid value). Signed-off-by: Jens Axboe --- fs/pipe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/pipe.c b/fs/pipe.c index d79872eba09a..01ca9fab95fa 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1170,7 +1170,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case F_SETPIPE_SZ: if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) - return -EINVAL; + return -EPERM; /* * The pipe needs to be at least 2 pages large to * guarantee POSIX behaviour. -- cgit v1.2.3 From b9598db3401282bb27b4aef77e3eee12015f7f29 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 May 2010 19:34:43 +0200 Subject: pipe: make F_{GET,SET}PIPE_SZ deal with byte sizes Instead of requiring an exact number of pages as the argument and return value, change the API to deal with number of bytes instead. This also relaxes the requirement that the passed in size must result in a power-of-2 page array size. Round up to the nearest power-of-2 automatically and return the resulting size of the pipe on success. Signed-off-by: Jens Axboe --- fs/pipe.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 01ca9fab95fa..bdd3f96054b9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1112,26 +1112,20 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes) * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) { struct pipe_buffer *bufs; - /* - * Must be a power-of-2 currently - */ - if (!is_power_of_2(arg)) - return -EINVAL; - /* * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't * expect a lot of shrink+grow operations, just free and allocate * again like we would do for growing. If the pipe currently * contains more buffers than arg, then return busy. */ - if (arg < pipe->nrbufs) + if (nr_pages < pipe->nrbufs) return -EBUSY; - bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL); + bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); if (unlikely(!bufs)) return -ENOMEM; @@ -1152,8 +1146,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; - pipe->buffers = arg; - return arg; + pipe->buffers = nr_pages; + return nr_pages * PAGE_SIZE; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) @@ -1168,19 +1162,30 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) mutex_lock(&pipe->inode->i_mutex); switch (cmd) { - case F_SETPIPE_SZ: - if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages) + case F_SETPIPE_SZ: { + unsigned long nr_pages; + + /* + * Currently the array must be a power-of-2 size, so adjust + * upwards if needed. + */ + nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_pages = roundup_pow_of_two(nr_pages); + + if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages) return -EPERM; + /* * The pipe needs to be at least 2 pages large to * guarantee POSIX behaviour. */ - if (arg < 2) + if (nr_pages < 2) return -EINVAL; - ret = pipe_set_size(pipe, arg); + ret = pipe_set_size(pipe, nr_pages); break; + } case F_GETPIPE_SZ: - ret = pipe->buffers; + ret = pipe->buffers * PAGE_SIZE; break; default: ret = -EINVAL; -- cgit v1.2.3 From d02a2c077fb81f3224c770be62a318165b23b486 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 25 May 2010 10:16:53 +0200 Subject: cfq-iosched: fix an oops caused by slab leak I got below oops when unloading cfq-iosched. Considering scenario: queue A merge to B, C merge to D and B will be merged to D. Before B is merged to D, we do split B. We should put B's reference for D. [ 807.768536] ============================================================================= [ 807.768539] BUG cfq_queue: Objects remaining on kmem_cache_close() [ 807.768541] ----------------------------------------------------------------------------- [ 807.768543] [ 807.768546] INFO: Slab 0xffffea0003e6b4e0 objects=26 used=1 fp=0xffff88011d584fd8 flags=0x200000000004082 [ 807.768550] Pid: 5946, comm: rmmod Tainted: G W 2.6.34-07097-gf4b87de-dirty #724 [ 807.768552] Call Trace: [ 807.768560] [] slab_err+0x8f/0x9d [ 807.768564] [] ? flush_cpu_slab+0x0/0x93 [ 807.768569] [] ? add_preempt_count+0xe/0xca [ 807.768572] [] ? sub_preempt_count+0xe/0xb6 [ 807.768577] [] ? _raw_spin_unlock+0x15/0x30 [ 807.768580] [] ? sub_preempt_count+0xe/0xb6 [ 807.768584] [] list_slab_objects+0x9b/0x19f [ 807.768588] [] ? add_preempt_count+0xc6/0xca [ 807.768591] [] kmem_cache_destroy+0x13f/0x21d [ 807.768597] [] cfq_slab_kill+0x1a/0x43 [cfq_iosched] [ 807.768601] [] cfq_exit+0x93/0x9e [cfq_iosched] [ 807.768606] [] sys_delete_module+0x1b1/0x219 [ 807.768612] [] system_call_fastpath+0x16/0x1b [ 807.768618] INFO: Object 0xffff88011d584618 @offset=1560 [ 807.768622] INFO: Allocated in cfq_get_queue+0x11e/0x274 [cfq_iosched] age=7173 cpu=1 pid=5496 [ 807.768626] ============================================================================= Cc: stable@kernel.org Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c72e5acc573d..5ff4f4850e71 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2560,15 +2560,10 @@ static void cfq_free_io_context(struct io_context *ioc) __call_for_each_cic(ioc, cic_free_func); } -static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void cfq_put_cooperator(struct cfq_queue *cfqq) { struct cfq_queue *__cfqq, *next; - if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); - } - /* * If this queue was scheduled to merge with another queue, be * sure to drop the reference taken on that queue (and others in @@ -2584,6 +2579,16 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_put_queue(__cfqq); __cfqq = next; } +} + +static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + if (unlikely(cfqq == cfqd->active_queue)) { + __cfq_slice_expired(cfqd, cfqq, 0); + cfq_schedule_dispatch(cfqd); + } + + cfq_put_cooperator(cfqq); cfq_put_queue(cfqq); } @@ -3536,6 +3541,9 @@ split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) } cic_set_cfqq(cic, NULL, 1); + + cfq_put_cooperator(cfqq); + cfq_put_queue(cfqq); return NULL; } -- cgit v1.2.3 From 0ae0b5d0557264bad65e22f1e2da4b83a02c4535 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 25 May 2010 10:25:26 +0200 Subject: fs/splice.c: fix mapping_gfp_mask usage mapping_gfp_mask() is not supposed to store allocation contex details, only page location details. So mapping_gfp_mask should be applied to the pagecache page allocation, wheras normal (kernel mapped) memory should be used for surrounding allocations such as radix-tree nodes allocated by add_to_page_cache. Context modifiers should be applied on a per-callsite basis. So change splice to follow this convention (which is followed in similar code patterns in core code). Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/splice.c b/fs/splice.c index ac22b00d86c3..740e6b9faf7a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -354,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, break; error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_mask(mapping)); + GFP_KERNEL); if (unlikely(error)) { page_cache_release(page); if (error == -EEXIST) -- cgit v1.2.3 From f17625b318d9b151e7bd41e31223e9d89b2aaa77 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Jun 2010 11:05:22 +0200 Subject: Revert "writeback: ensure that WB_SYNC_NONE writeback with sb pinned is sync" This reverts commit 7c8a3554c683f512dbcee26faedb42e4c05f12fa. We are investigating a hang associated with the WB_SYNC_NONE changes, so revert them for now. Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5c4161f1fd9a..6753912641b4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -193,8 +193,7 @@ static void bdi_wait_on_work_clear(struct bdi_work *work) } static void bdi_alloc_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_args *args, - int wait) + struct wb_writeback_args *args) { struct bdi_work *work; @@ -206,8 +205,6 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi, if (work) { bdi_work_init(work, args); bdi_queue_work(bdi, work); - if (wait) - bdi_wait_on_work_clear(work); } else { struct bdi_writeback *wb = &bdi->wb; @@ -282,7 +279,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, args.for_background = 1; } - bdi_alloc_queue_work(bdi, &args, sb_locked); + bdi_alloc_queue_work(bdi, &args); } /* @@ -912,7 +909,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) while ((work = get_next_work_item(bdi, wb)) != NULL) { struct wb_writeback_args args = work->args; - int post_clear; /* * Override sync mode, in case we must wait for completion @@ -920,13 +916,11 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; - post_clear = WB_SYNC_ALL || args.sb_pinned; - /* * If this isn't a data integrity operation, just notify * that we have seen this work and we are now starting it. */ - if (!post_clear) + if (args.sync_mode == WB_SYNC_NONE) wb_clear_pending(wb, work); wrote += wb_writeback(wb, &args); @@ -935,7 +929,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) * This is a data integrity writeback, so only do the * notification when we have completed the work. */ - if (post_clear) + if (args.sync_mode == WB_SYNC_ALL) wb_clear_pending(wb, work); } @@ -1011,7 +1005,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages) if (!bdi_has_dirty_io(bdi)) continue; - bdi_alloc_queue_work(bdi, &args, 0); + bdi_alloc_queue_work(bdi, &args); } rcu_read_unlock(); -- cgit v1.2.3 From 0e3c9a2284f5417f196e327c254d0b84c9ee8929 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Jun 2010 11:08:43 +0200 Subject: Revert "writeback: fix WB_SYNC_NONE writeback from umount" This reverts commit e913fc825dc685a444cb4c1d0f9d32f372f59861. We are investigating a hang associated with the WB_SYNC_NONE changes, so revert them for now. Conflicts: fs/fs-writeback.c mm/page-writeback.c Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 48 +++++++++++---------------------------------- fs/sync.c | 2 +- include/linux/backing-dev.h | 2 +- include/linux/writeback.h | 10 ---------- mm/page-writeback.c | 4 ++-- 5 files changed, 15 insertions(+), 51 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6753912641b4..408a7877b79d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -45,7 +45,6 @@ struct wb_writeback_args { unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; - unsigned int sb_pinned:1; }; /* @@ -231,11 +230,6 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, .range_cyclic = 0, - /* - * Setting sb_pinned is not necessary for WB_SYNC_ALL, but - * lets make it explicitly clear. - */ - .sb_pinned = 1, }; struct bdi_work work; @@ -251,23 +245,21 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, * @bdi: the backing device to write from * @sb: write inodes from this super_block * @nr_pages: the number of pages to write - * @sb_locked: caller already holds sb umount sem. * * Description: * This does WB_SYNC_NONE opportunistic writeback. The IO is only * started when this function returns, we make no guarentees on - * completion. Caller specifies whether sb umount sem is held already or not. + * completion. Caller need not hold sb s_umount semaphore. * */ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages, int sb_locked) + long nr_pages) { struct wb_writeback_args args = { .sb = sb, .sync_mode = WB_SYNC_NONE, .nr_pages = nr_pages, .range_cyclic = 1, - .sb_pinned = sb_locked, }; /* @@ -592,7 +584,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, /* * Caller must already hold the ref for this */ - if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { + if (wbc->sync_mode == WB_SYNC_ALL) { WARN_ON(!rwsem_is_locked(&sb->s_umount)); return SB_NOT_PINNED; } @@ -766,7 +758,6 @@ static long wb_writeback(struct bdi_writeback *wb, .for_kupdate = args->for_kupdate, .for_background = args->for_background, .range_cyclic = args->range_cyclic, - .sb_pinned = args->sb_pinned, }; unsigned long oldest_jif; long wrote = 0; @@ -1214,18 +1205,6 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } -static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) -{ - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - long nr_to_write; - - nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - - bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked); -} - /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock @@ -1237,21 +1216,16 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked) */ void writeback_inodes_sb(struct super_block *sb) { - __writeback_inodes_sb(sb, 0); -} -EXPORT_SYMBOL(writeback_inodes_sb); + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + long nr_to_write; -/** - * writeback_inodes_sb_locked - writeback dirty inodes from given super_block - * @sb: the superblock - * - * Like writeback_inodes_sb(), except the caller already holds the - * sb umount sem. - */ -void writeback_inodes_sb_locked(struct super_block *sb) -{ - __writeback_inodes_sb(sb, 1); + nr_to_write = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + bdi_start_writeback(sb->s_bdi, sb, nr_to_write); } +EXPORT_SYMBOL(writeback_inodes_sb); /** * writeback_inodes_sb_if_idle - start writeback if none underway diff --git a/fs/sync.c b/fs/sync.c index e8cbd415e50a..5a537ccd2e85 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) if (wait) sync_inodes_sb(sb); else - writeback_inodes_sb_locked(sb); + writeback_inodes_sb(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e6e0cb5437e6..aee5f6ce166e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -106,7 +106,7 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, - long nr_pages, int sb_locked); + long nr_pages); int bdi_writeback_task(struct bdi_writeback *wb); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_arm_supers_timer(void); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index cc97d6caf2b3..f64134653a8c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -65,15 +65,6 @@ struct writeback_control { * so we use a single control to update them */ unsigned no_nrwrite_index_update:1; - - /* - * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE, - * the writeback code will pin the sb for the caller. However, - * for eg umount, the caller does WB_SYNC_NONE but already has - * the sb pinned. If the below is set, caller already has the - * sb pinned. - */ - unsigned sb_pinned:1; }; /* @@ -82,7 +73,6 @@ struct writeback_control { struct bdi_writeback; int inode_wait(void *); void writeback_inodes_sb(struct super_block *); -void writeback_inodes_sb_locked(struct super_block *); int writeback_inodes_sb_if_idle(struct super_block *); void sync_inodes_sb(struct super_block *); void writeback_inodes_wbc(struct writeback_control *wbc); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b289310e2c89..5fa63bdf52e4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping, (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS)) > background_thresh))) - bdi_start_writeback(bdi, NULL, 0, 0); + bdi_start_writeback(bdi, NULL, 0); } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -707,7 +707,7 @@ void laptop_mode_timer_fn(unsigned long data) */ if (bdi_has_dirty_io(&q->backing_dev_info)) - bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0); + bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages); } /* -- cgit v1.2.3 From b7c335713ea130d707c22d7f7c57a8eca75ded7e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 26 May 2010 15:41:14 +0200 Subject: brd: support discard Support discard requests in brd by zeroing or deleting the underlying backing pages. This is simply to help with testing and documentation nature of brd code. Signed-off-by: Nick Piggin Signed-off-by: Jens Axboe --- drivers/block/brd.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 6081e81d5738..f1bf79d9bc0a 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -133,6 +133,28 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) return page; } +static void brd_free_page(struct brd_device *brd, sector_t sector) +{ + struct page *page; + pgoff_t idx; + + spin_lock(&brd->brd_lock); + idx = sector >> PAGE_SECTORS_SHIFT; + page = radix_tree_delete(&brd->brd_pages, idx); + spin_unlock(&brd->brd_lock); + if (page) + __free_page(page); +} + +static void brd_zero_page(struct brd_device *brd, sector_t sector) +{ + struct page *page; + + page = brd_lookup_page(brd, sector); + if (page) + clear_highpage(page); +} + /* * Free all backing store pages and radix tree. This must only be called when * there are no other users of the device. @@ -189,6 +211,24 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n) return 0; } +static void discard_from_brd(struct brd_device *brd, + sector_t sector, size_t n) +{ + while (n >= PAGE_SIZE) { + /* + * Don't want to actually discard pages here because + * re-allocating the pages can result in writeback + * deadlocks under heavy load. + */ + if (0) + brd_free_page(brd, sector); + else + brd_zero_page(brd, sector); + sector += PAGE_SIZE >> SECTOR_SHIFT; + n -= PAGE_SIZE; + } +} + /* * Copy n bytes from src to the brd starting at sector. Does not sleep. */ @@ -300,6 +340,12 @@ static int brd_make_request(struct request_queue *q, struct bio *bio) get_capacity(bdev->bd_disk)) goto out; + if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) { + err = 0; + discard_from_brd(brd, sector, bio->bi_size); + goto out; + } + rw = bio_rw(bio); if (rw == READA) rw = READ; @@ -320,7 +366,7 @@ out: } #ifdef CONFIG_BLK_DEV_XIP -static int brd_direct_access (struct block_device *bdev, sector_t sector, +static int brd_direct_access(struct block_device *bdev, sector_t sector, void **kaddr, unsigned long *pfn) { struct brd_device *brd = bdev->bd_disk->private_data; @@ -437,6 +483,11 @@ static struct brd_device *brd_alloc(int i) blk_queue_max_hw_sectors(brd->brd_queue, 1024); blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); + brd->brd_queue->limits.discard_granularity = PAGE_SIZE; + brd->brd_queue->limits.max_discard_sectors = UINT_MAX; + brd->brd_queue->limits.discard_zeroes_data = 1; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); + disk = brd->brd_disk = alloc_disk(1 << part_shift); if (!disk) goto out_free_queue; -- cgit v1.2.3 From 2c8d196759054b632788633b20e39167df36041d Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 25 May 2010 14:32:03 +0200 Subject: drbd: Revert "drbd: Create new current UUID as late as possible" The late-UUID writing is delayed until the next release. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 9 +-------- drivers/block/drbd/drbd_main.c | 34 ++++++---------------------------- drivers/block/drbd/drbd_receiver.c | 11 ----------- 3 files changed, 7 insertions(+), 47 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e9654c8d5b62..c194348a46ed 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -943,8 +943,7 @@ struct drbd_conf { struct drbd_work resync_work, unplug_work, md_sync_work, - delay_probe_work, - uuid_work; + delay_probe_work; struct timer_list resync_timer; struct timer_list md_sync_timer; struct timer_list delay_probe_timer; @@ -1069,7 +1068,6 @@ struct drbd_conf { struct timeval dps_time; /* delay-probes-start-time */ unsigned int dp_volume_last; /* send_cnt of last delay probe */ int c_sync_rate; /* current resync rate after delay_probe magic */ - atomic_t new_c_uuid; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -2219,8 +2217,6 @@ static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) return 0; if (test_bit(BITMAP_IO, &mdev->flags)) return 0; - if (atomic_read(&mdev->new_c_uuid)) - return 0; return 1; } @@ -2241,9 +2237,6 @@ static inline void inc_ap_bio(struct drbd_conf *mdev, int count) * to avoid races with the reconnect code, * we need to atomic_inc within the spinlock. */ - if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1)) - drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work); - spin_lock_irq(&mdev->req_lock); while (!__inc_ap_bio_cond(mdev)) { prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index be2d2da9cdba..1fcf2d1bcc39 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -1215,18 +1215,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, ns.pdsk == D_OUTDATED)) { if (get_ldev(mdev)) { if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && - mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE && - !atomic_read(&mdev->new_c_uuid)) - atomic_set(&mdev->new_c_uuid, 2); + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + } put_ldev(mdev); } } if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - /* Diskless peer becomes primary or got connected do diskless, primary peer. */ - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 && - !atomic_read(&mdev->new_c_uuid)) - atomic_set(&mdev->new_c_uuid, 2); + if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) + drbd_uuid_new_current(mdev); /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) @@ -1350,24 +1349,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, drbd_md_sync(mdev); } -static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel) -{ - if (get_ldev(mdev)) { - if (mdev->ldev->md.uuid[UI_BITMAP] == 0) { - drbd_uuid_new_current(mdev); - if (get_net_conf(mdev)) { - drbd_send_uuids(mdev); - put_net_conf(mdev); - } - drbd_md_sync(mdev); - } - put_ldev(mdev); - } - atomic_dec(&mdev->new_c_uuid); - wake_up(&mdev->misc_wait); - - return 1; -} static int drbd_thread_setup(void *arg) { @@ -2708,7 +2689,6 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->net_cnt, 0); atomic_set(&mdev->packet_seq, 0); atomic_set(&mdev->pp_in_use, 0); - atomic_set(&mdev->new_c_uuid, 0); mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); @@ -2739,14 +2719,12 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) INIT_LIST_HEAD(&mdev->bm_io_work.w.list); INIT_LIST_HEAD(&mdev->delay_probes); INIT_LIST_HEAD(&mdev->delay_probe_work.list); - INIT_LIST_HEAD(&mdev->uuid_work.list); mdev->resync_work.cb = w_resync_inactive; mdev->unplug_work.cb = w_send_write_hint; mdev->md_sync_work.cb = w_md_sync; mdev->bm_io_work.w.cb = w_bitmap_io; mdev->delay_probe_work.cb = w_delay_probes; - mdev->uuid_work.cb = w_new_current_uuid; init_timer(&mdev->resync_timer); init_timer(&mdev->md_sync_timer); init_timer(&mdev->delay_probe_timer); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index bc9ab7fb2cc7..a56340dd5710 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1154,17 +1154,6 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, unsigned n_bios = 0; unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT; - if (atomic_read(&mdev->new_c_uuid)) { - if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) { - drbd_uuid_new_current(mdev); - drbd_md_sync(mdev); - - atomic_dec(&mdev->new_c_uuid); - wake_up(&mdev->misc_wait); - } - wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid)); - } - /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the -- cgit v1.2.3 From 344fa462e3246bd102059ccc3c59deef416676dd Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 25 May 2010 14:23:57 +0200 Subject: drbd: improve network latency, TCP_QUICKACK On Thu, Apr 29, 2010 at 04:00:50PM -0400, Eduard.Guzovsky@stratus.com wrote on drbd-dev@lists.linbit.com Subject: [Drbd-dev] DRBD small synchronous writes performance improvements > 1. TCP_QUICKACK option is set incorrectly. The goal was force TCP to > send and ACK as a "one time" event. Instead the code permanently sets > connection in the QUICKACK mode. He is right, we actually want to use an even val with TCP_QUICKACK. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index c194348a46ed..6d79a76ba597 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1540,7 +1540,7 @@ static inline void drbd_tcp_nodelay(struct socket *sock) static inline void drbd_tcp_quickack(struct socket *sock) { - int __user val = 1; + int __user val = 2; (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, (char __user *)&val, sizeof(val)); } -- cgit v1.2.3 From 5dbf1673383f2f1554f0634fdfc390d59dc2c7d6 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 25 May 2010 16:18:01 +0200 Subject: drbd: need to set socket bufsize early to take effect quoting tcp(7): On individual connections, the socket buffer size must be set prior to the listen(2) or connect(2) calls in order to have it take effect. This adds a wrapper to do so, and uses it appropriately. Improves performance in certain situations. Note that because we cannot easily determine which socket will be "meta" and wich "data" (bulk) socket, we adjust both sockets. Previously, DRBD only adjusted the bufsizes of the "data" socket. Thanks again to Eduard.Guzovsky@stratus.com. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index a56340dd5710..f9a3e199e715 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -571,6 +571,25 @@ static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) return rv; } +/* quoting tcp(7): + * On individual connections, the socket buffer size must be set prior to the + * listen(2) or connect(2) calls in order to have it take effect. + * This is our wrapper to do so. + */ +static void drbd_setbufsize(struct socket *sock, unsigned int snd, + unsigned int rcv) +{ + /* open coded SO_SNDBUF, SO_RCVBUF */ + if (snd) { + sock->sk->sk_sndbuf = snd; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } + if (rcv) { + sock->sk->sk_rcvbuf = rcv; + sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } +} + static struct socket *drbd_try_connect(struct drbd_conf *mdev) { const char *what; @@ -592,6 +611,8 @@ static struct socket *drbd_try_connect(struct drbd_conf *mdev) sock->sk->sk_rcvtimeo = sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; + drbd_setbufsize(sock, mdev->net_conf->sndbuf_size, + mdev->net_conf->rcvbuf_size); /* explicitly bind to the configured IP as source IP * for the outgoing connections. @@ -670,6 +691,8 @@ static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ s_listen->sk->sk_rcvtimeo = timeo; s_listen->sk->sk_sndtimeo = timeo; + drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size, + mdev->net_conf->rcvbuf_size); what = "bind before listen"; err = s_listen->ops->bind(s_listen, @@ -856,16 +879,6 @@ retry: sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; msock->sk->sk_priority = TC_PRIO_INTERACTIVE; - if (mdev->net_conf->sndbuf_size) { - sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; - sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - } - - if (mdev->net_conf->rcvbuf_size) { - sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size; - sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - } - /* NOT YET ... * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; -- cgit v1.2.3 From ba11ad9a3b9dd2dbb9c6686ea9d41a9a77d94327 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 25 May 2010 16:26:16 +0200 Subject: drbd: improve usage of MSG_MORE It seems to improve performance if we allow the "p_data" header in its own frame (no MSG_MORE), but sendpage all but the last page with MSG_MORE. This is also in preparation of a later zero copy receive implementation. Suggested by Eduard.Guzovsky@stratus.com on drbd-dev. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 1fcf2d1bcc39..c978557b4b80 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2272,9 +2272,9 @@ static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket * * with page_count == 0 or PageSlab. */ static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, - int offset, size_t size) + int offset, size_t size, unsigned msg_flags) { - int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags); kunmap(page); if (sent == size) mdev->send_cnt += size>>9; @@ -2282,7 +2282,7 @@ static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, } static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, - int offset, size_t size) + int offset, size_t size, unsigned msg_flags) { mm_segment_t oldfs = get_fs(); int sent, ok; @@ -2295,14 +2295,15 @@ static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, * __page_cache_release a page that would actually still be referenced * by someone, leading to some obscure delayed Oops somewhere else. */ if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) - return _drbd_no_send_page(mdev, page, offset, size); + return _drbd_no_send_page(mdev, page, offset, size, msg_flags); + msg_flags |= MSG_NOSIGNAL; drbd_update_congested(mdev); set_fs(KERNEL_DS); do { sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, offset, len, - MSG_NOSIGNAL); + msg_flags); if (sent == -EAGAIN) { if (we_should_drop_the_connection(mdev, mdev->data.socket)) @@ -2331,9 +2332,11 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) { struct bio_vec *bvec; int i; + /* hint all but last page with MSG_MORE */ __bio_for_each_segment(bvec, bio, i, 0) { if (!_drbd_no_send_page(mdev, bvec->bv_page, - bvec->bv_offset, bvec->bv_len)) + bvec->bv_offset, bvec->bv_len, + i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) return 0; } return 1; @@ -2343,12 +2346,13 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) { struct bio_vec *bvec; int i; + /* hint all but last page with MSG_MORE */ __bio_for_each_segment(bvec, bio, i, 0) { if (!_drbd_send_page(mdev, bvec->bv_page, - bvec->bv_offset, bvec->bv_len)) + bvec->bv_offset, bvec->bv_len, + i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) return 0; } - return 1; } @@ -2356,9 +2360,11 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) { struct page *page = e->pages; unsigned len = e->size; + /* hint all but last page with MSG_MORE */ page_chain_for_each(page) { unsigned l = min_t(unsigned, len, PAGE_SIZE); - if (!_drbd_send_page(mdev, page, 0, l)) + if (!_drbd_send_page(mdev, page, 0, l, + page_chain_next(page) ? MSG_MORE : 0)) return 0; len -= l; } @@ -2438,11 +2444,11 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) p.dp_flags = cpu_to_be32(dp_flags); set_bit(UNPLUG_REMOTE, &mdev->flags); ok = (sizeof(p) == - drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); + drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0)); if (ok && dgs) { dgb = mdev->int_dig_out; drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); - ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); + ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); } if (ok) { if (mdev->net_conf->wire_protocol == DRBD_PROT_A) @@ -2491,11 +2497,11 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, return 0; ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, - sizeof(p), MSG_MORE); + sizeof(p), dgs ? MSG_MORE : 0); if (ok && dgs) { dgb = mdev->int_dig_out; drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb); - ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); + ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, 0); } if (ok) ok = _drbd_send_zc_ee(mdev, e); -- cgit v1.2.3 From 039e1fb65496636778e24c881a5e58ed7c39fbb3 Mon Sep 17 00:00:00 2001 From: Andrea Gelmini Date: Sun, 23 May 2010 21:48:13 +0200 Subject: drbd: removed duplicated #includes drbd/drbd_receiver.c: linux/mm.h is included more than once. Signed-off-by: Andrea Gelmini Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index f9a3e199e715..dff48701b84d 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include "drbd_int.h" -- cgit v1.2.3 From 32fa7e91f923d8b2578c42016ff3a94efc9968a2 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 26 May 2010 17:13:18 +0200 Subject: drbd: Removed the now empty w_io_error() function Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 1 - drivers/block/drbd/drbd_req.c | 27 +-------------------------- drivers/block/drbd/drbd_worker.c | 14 -------------- 3 files changed, 1 insertion(+), 41 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 6d79a76ba597..86605f6d0854 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1474,7 +1474,6 @@ extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int); extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); -extern int w_io_error(struct drbd_conf *, struct drbd_work *, int); extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int); extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3397f11d0ba9..e6c4d579eaba 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -102,32 +102,7 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const } } - /* if it was a local io error, we want to notify our - * peer about that, and see if we need to - * detach the disk and stuff. - * to avoid allocating some special work - * struct, reuse the request. */ - - /* THINK - * why do we do this not when we detect the error, - * but delay it until it is "done", i.e. possibly - * until the next barrier ack? */ - - if (rw == WRITE && - ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) { - if (!(req->w.list.next == LIST_POISON1 || - list_empty(&req->w.list))) { - /* DEBUG ASSERT only; if this triggers, we - * probably corrupt the worker list here */ - dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next); - dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev); - } - req->w.cb = w_io_error; - drbd_queue_work(&mdev->data.work, &req->w); - /* drbd_req_free() is done in w_io_error */ - } else { - drbd_req_free(req); - } + drbd_req_free(req); } static void queue_barrier(struct drbd_conf *mdev) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 727ff6339754..a12b447bafbd 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -257,20 +257,6 @@ void drbd_endio_pri(struct bio *bio, int error) complete_master_bio(mdev, &m); } -int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel) -{ - struct drbd_request *req = container_of(w, struct drbd_request, w); - - /* NOTE: mdev->ldev can be NULL by the time we get here! */ - /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */ - - /* the only way this callback is scheduled is from _req_may_be_done, - * when it is done and had a local write error, see comments there */ - drbd_req_free(req); - - return TRUE; -} - int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_request *req = container_of(w, struct drbd_request, w); -- cgit v1.2.3 From d255e5ff5fc6cc6c60dd014d1261448a7bbc8134 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 27 May 2010 09:45:45 +0200 Subject: drbd: fix hang on local read errors while disconnected "canceled" w_read_retry_remote never completed, if they have been canceled after drbd_disconnect connection teardown cleanup has already run (or we are currently not connected anyways). Fixed by not queueing a remote retry if we already know it won't work (pdsk not uptodate), and cleanup ourselves on "cancel", in case we hit a race with drbd_disconnect. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 22 +++++++++++++--------- drivers/block/drbd/drbd_req.h | 1 + drivers/block/drbd/drbd_worker.c | 6 ++---- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index e6c4d579eaba..8915644af722 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -452,20 +452,21 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, dev_alert(DEV, "Local READ failed sec=%llus size=%u\n", (unsigned long long)req->sector, req->size); - /* _req_mod(req,to_be_send); oops, recursion... */ D_ASSERT(!(req->rq_state & RQ_NET_MASK)); - req->rq_state |= RQ_NET_PENDING; - inc_ap_pending(mdev); __drbd_chk_io_error(mdev, FALSE); put_ldev(mdev); - /* NOTE: if we have no connection, - * or know the peer has no good data either, - * then we don't actually need to "queue_for_net_read", - * but we do so anyways, since the drbd_io_error() - * and the potential state change to "Diskless" - * needs to be done from process context */ + /* no point in retrying if there is no good remote data, + * or we have no connection. */ + if (mdev->state.pdsk != D_UP_TO_DATE) { + _req_may_be_done(req, m); + break; + } + + /* _req_mod(req,to_be_send); oops, recursion... */ + req->rq_state |= RQ_NET_PENDING; + inc_ap_pending(mdev); /* fall through: _req_mod(req,queue_for_net_read); */ case queue_for_net_read: @@ -575,6 +576,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, _req_may_be_done(req, m); break; + case read_retry_remote_canceled: + req->rq_state &= ~RQ_NET_QUEUED; + /* fall through, in case we raced with drbd_disconnect */ case connection_lost_while_pending: /* transfer log cleanup after connection loss */ /* assert something? */ diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 16119d7056cc..02d575d24518 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -91,6 +91,7 @@ enum drbd_req_event { send_failed, handed_over_to_network, connection_lost_while_pending, + read_retry_remote_canceled, recv_acked_by_peer, write_acked_by_peer, write_acked_by_peer_and_sis, /* and set_in_sync */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index a12b447bafbd..67371fcbc5aa 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -266,10 +266,8 @@ int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) * to give the disk the chance to relocate that block */ spin_lock_irq(&mdev->req_lock); - if (cancel || - mdev->state.conn < C_CONNECTED || - mdev->state.pdsk <= D_INCONSISTENT) { - _req_mod(req, send_canceled); + if (cancel || mdev->state.pdsk != D_UP_TO_DATE) { + _req_mod(req, read_retry_remote_canceled); spin_unlock_irq(&mdev->req_lock); dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n"); return 1; -- cgit v1.2.3 From 7383506c87237dbd627f0b8b72b50117f25c5ca2 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Thu, 27 May 2010 11:51:56 +0200 Subject: drbd: use drbd specific ratelimit instead of global printk_ratelimit using the global printk_ratelimit() may mask other messages. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 86605f6d0854..485ed8c7d623 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1725,7 +1725,7 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, switch (mdev->ldev->dc.on_io_error) { case EP_PASS_ON: if (!forcedetach) { - if (printk_ratelimit()) + if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Local IO failed in %s." "Passing error on...\n", where); break; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index c978557b4b80..6b077f93acc6 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3783,7 +3783,7 @@ _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) if (ret) { fault_count++; - if (printk_ratelimit()) + if (__ratelimit(&drbd_ratelimit_state)) dev_warn(DEV, "***Simulating %s failure\n", _drbd_fault_str(type)); } -- cgit v1.2.3 From 2a0ab2cd73c26835e635ed4e3868f983519048fb Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 26 May 2010 17:59:55 +0200 Subject: drbd: Reduce verbosity The "Local READ/WRITE failed" messages are too verbose. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 5 ----- drivers/block/drbd/drbd_worker.c | 4 ---- 2 files changed, 9 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 8915644af722..654f1ef5cbb0 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -428,9 +428,6 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n", - (unsigned long long)req->sector, req->size); - /* and now: check how to handle local io error. */ __drbd_chk_io_error(mdev, FALSE); _req_may_be_done(req, m); put_ldev(mdev); @@ -450,8 +447,6 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - dev_alert(DEV, "Local READ failed sec=%llus size=%u\n", - (unsigned long long)req->sector, req->size); D_ASSERT(!(req->rq_state & RQ_NET_MASK)); __drbd_chk_io_error(mdev, FALSE); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 67371fcbc5aa..b623ceee2a4a 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -224,9 +224,6 @@ void drbd_endio_pri(struct bio *bio, int error) enum drbd_req_event what; int uptodate = bio_flagged(bio, BIO_UPTODATE); - if (error) - dev_warn(DEV, "p %s: error=%d\n", - bio_data_dir(bio) == WRITE ? "write" : "read", error); if (!error && !uptodate) { dev_warn(DEV, "p %s: setting error to -EIO\n", bio_data_dir(bio) == WRITE ? "write" : "read"); @@ -269,7 +266,6 @@ int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) if (cancel || mdev->state.pdsk != D_UP_TO_DATE) { _req_mod(req, read_retry_remote_canceled); spin_unlock_irq(&mdev->req_lock); - dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n"); return 1; } spin_unlock_irq(&mdev->req_lock); -- cgit v1.2.3 From 099c5c310e9744bd0654881bb55c137051228e56 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Thu, 27 May 2010 13:46:35 +0200 Subject: Preparing 8.3.8rc2 Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 68530521ad00..30da4ae48972 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.8rc1" +#define REL_VERSION "8.3.8rc2" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 94 -- cgit v1.2.3 From 713b686494a577b3c4f4f9f585a4705fc30d51c2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 1 Jun 2010 12:17:48 +0200 Subject: cciss: call BUG() earlier I moved the range check after the increment. The current code would write past the end of the array once before calling BUG(). Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- drivers/block/cciss_scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index e1d0e2cfec72..3381505c8a6c 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -188,11 +188,11 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *cmd) sa = h->scsi_ctlr; stk = &sa->cmd_stack; + stk->top++; if (stk->top >= CMD_STACK_SIZE) { printk("cciss: scsi_cmd_free called too many times.\n"); BUG(); } - stk->top++; stk->elem[stk->top] = (struct cciss_scsi_cmd_stack_elem_t *) cmd; } -- cgit v1.2.3 From 28f4197e5d4707311febeec8a0eb97cb5fd93c97 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Jun 2010 12:23:18 +0200 Subject: block: disable preemption before using sched_clock() Commit 9195291e5f05e01d67f9a09c756b8aca8f009089 added calls to sched_clock() from preemptible code. sched_clock() is both the wrong interface AND cannot be called without preempt disabled. Apply a temporary fix to get rid of the warnings, a real patch is in the works. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8b7f5e0914ad..09a840264d6f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1211,14 +1211,23 @@ struct work_struct; int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); #ifdef CONFIG_BLK_CGROUP +/* + * This should not be using sched_clock(). A real patch is in progress + * to fix this up, until that is in place we need to disable preemption + * around sched_clock() in this function and set_io_start_time_ns(). + */ static inline void set_start_time_ns(struct request *req) { + preempt_disable(); req->start_time_ns = sched_clock(); + preempt_enable(); } static inline void set_io_start_time_ns(struct request *req) { + preempt_disable(); req->io_start_time_ns = sched_clock(); + preempt_enable(); } static inline uint64_t rq_start_time_ns(struct request *req) -- cgit v1.2.3 From 6a6ca57de92fcae34603551ac944aa74758c30d4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jun 2010 12:44:30 +0200 Subject: pipe: adjust minimum pipe size to 1 page We don't need to pages to guarantee the POSIX requirement that upto a page size write must be atomic to an empty pipe. Signed-off-by: Jens Axboe --- fs/pipe.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 541d6626f9d9..369a0245aab6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1181,13 +1181,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages) { ret = -EPERM; goto out; - } - - /* - * The pipe needs to be at least 2 pages large to - * guarantee POSIX behaviour. - */ - if (arg < 2) { + } else if (nr_pages < 1) { ret = -EINVAL; goto out; } -- cgit v1.2.3 From 419f8367ea37e5adc5d95479e8fd5554b92b49fe Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jun 2010 12:45:28 +0200 Subject: pipe: change the privilege required for growing a pipe beyond system max Change it to CAP_SYS_RESOURCE, as that more accurately models what we want to control. Suggested-by: Michael Kerrisk Signed-off-by: Jens Axboe --- fs/pipe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/pipe.c b/fs/pipe.c index 369a0245aab6..f98fae3e36b0 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1178,7 +1178,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT; nr_pages = roundup_pow_of_two(nr_pages); - if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages) { + if (!capable(CAP_SYS_RESOURCE) && nr_pages > pipe_max_pages) { ret = -EPERM; goto out; } else if (nr_pages < 1) { -- cgit v1.2.3 From ff9da691c0498ff81fdd014e7a0731dab2337dac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Jun 2010 14:54:39 +0200 Subject: pipe: change /proc/sys/fs/pipe-max-pages to byte sized interface This changes the interface to be based on bytes instead. The API matches that of F_SETPIPE_SZ in that it rounds up the passed in size so that the resulting page array is a power-of-2 in size. The proc file is renamed to /proc/sys/fs/pipe-max-size to reflect this change. Signed-off-by: Jens Axboe --- fs/pipe.c | 54 ++++++++++++++++++++++++++++++++++++----------- include/linux/pipe_fs_i.h | 4 +++- kernel/sysctl.c | 8 +++---- 3 files changed, 49 insertions(+), 17 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index f98fae3e36b0..69c4c7c13ea9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -26,9 +26,14 @@ /* * The max size that a non-root user is allowed to grow the pipe. Can - * be set by root in /proc/sys/fs/pipe-max-pages + * be set by root in /proc/sys/fs/pipe-max-size */ -unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16; +unsigned int pipe_max_size = 1048576; + +/* + * Minimum pipe size, as required by POSIX + */ +unsigned int pipe_min_size = PAGE_SIZE; /* * We use a start+len construction, which provides full use of the @@ -1156,6 +1161,35 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) return nr_pages * PAGE_SIZE; } +/* + * Currently we rely on the pipe array holding a power-of-2 number + * of pages. + */ +static inline unsigned int round_pipe_size(unsigned int size) +{ + unsigned long nr_pages; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; +} + +/* + * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * will return an error. + */ +int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); + if (ret < 0 || !write) + return ret; + + pipe_max_size = round_pipe_size(pipe_max_size); + return ret; +} + long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe; @@ -1169,23 +1203,19 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) switch (cmd) { case F_SETPIPE_SZ: { - unsigned long nr_pages; + unsigned int size, nr_pages; - /* - * Currently the array must be a power-of-2 size, so adjust - * upwards if needed. - */ - nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT; - nr_pages = roundup_pow_of_two(nr_pages); + size = round_pipe_size(arg); + nr_pages = size >> PAGE_SHIFT; - if (!capable(CAP_SYS_RESOURCE) && nr_pages > pipe_max_pages) { + if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; - } else if (nr_pages < 1) { + } else if (nr_pages < PAGE_SIZE) { ret = -EINVAL; goto out; } - ret = pipe_set_size(pipe, arg); + ret = pipe_set_size(pipe, nr_pages); break; } case F_GETPIPE_SZ: diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 16de3933c45e..445796945ac9 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -139,7 +139,9 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); -extern unsigned int pipe_max_pages; +extern unsigned int pipe_max_size, pipe_min_size; +int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 997080f00e0b..d24f761f4876 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1471,12 +1471,12 @@ static struct ctl_table fs_table[] = { }, #endif { - .procname = "pipe-max-pages", - .data = &pipe_max_pages, + .procname = "pipe-max-size", + .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &two, + .proc_handler = &pipe_proc_fn, + .extra1 = &pipe_min_size, }, /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.2.3 From c86d1b8ae622e1ea5d20e98bd72fbd7d9dd69016 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 3 Jun 2010 11:34:52 -0600 Subject: block: avoid unconditionally freeing previously allocated request_queue On blk_init_allocated_queue_node failure, only free the request_queue if it is wasn't previously allocated outside the block layer (e.g. blk_init_queue_node was blk_init_allocated_queue_node caller). This addresses an interface bug introduced by the following commit: 01effb0 block: allow initialization of previously allocated request_queue Otherwise the request_queue may be free'd out from underneath a caller that is managing the request_queue directly (e.g. caller uses blk_alloc_queue + blk_init_allocated_queue_node). Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 3bc5579d6f54..826d07078902 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -570,9 +570,17 @@ EXPORT_SYMBOL(blk_init_queue); struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { - struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); + struct request_queue *uninit_q, *q; - return blk_init_allocated_queue_node(q, rfn, lock, node_id); + uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); + if (!uninit_q) + return NULL; + + q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); + if (!q) + blk_cleanup_queue(uninit_q); + + return q; } EXPORT_SYMBOL(blk_init_queue_node); @@ -592,10 +600,8 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, return NULL; q->node = node_id; - if (blk_init_free_list(q)) { - kmem_cache_free(blk_requestq_cachep, q); + if (blk_init_free_list(q)) return NULL; - } q->request_fn = rfn; q->prep_rq_fn = NULL; @@ -618,7 +624,6 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, return q; } - blk_put_queue(q); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue_node); -- cgit v1.2.3 From 1abec4fdbb142e3ccb6ce99832fae42129134a96 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 25 May 2010 13:15:15 -0400 Subject: block: make blk_init_free_list and elevator_init idempotent blk_init_allocated_queue_node may fail and the caller _could_ retry. Accommodate the unlikely event that blk_init_allocated_queue_node is called on an already initialized (possibly partially) request_queue. Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +++ block/elevator.c | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 826d07078902..f84cce42fc58 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -467,6 +467,9 @@ static int blk_init_free_list(struct request_queue *q) { struct request_list *rl = &q->rq; + if (unlikely(rl->rq_pool)) + return 0; + rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; rl->elvpriv = 0; diff --git a/block/elevator.c b/block/elevator.c index 0abce473d606..923a9139106c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -242,9 +242,11 @@ int elevator_init(struct request_queue *q, char *name) { struct elevator_type *e = NULL; struct elevator_queue *eq; - int ret = 0; void *data; + if (unlikely(q->elevator)) + return 0; + INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; q->end_sector = 0; @@ -284,7 +286,7 @@ int elevator_init(struct request_queue *q, char *name) } elevator_attach(q, eq, data); - return ret; + return 0; } EXPORT_SYMBOL(elevator_init); -- cgit v1.2.3