summaryrefslogtreecommitdiffstats
path: root/fs/block_dev.c
diff options
context:
space:
mode:
authorMikulas Patocka <mpatocka@redhat.com>2012-09-26 07:46:43 +0200
committerJens Axboe <axboe@kernel.dk>2012-09-26 07:46:43 +0200
commit62ac665ff9fc07497ca524bd20d6a96893d11071 (patch)
treedfd697e488fde4b46f1cb2ebfb380bb881115827 /fs/block_dev.c
parentb87570f5d349661814b262dd5fc40787700f80d6 (diff)
downloadlinux-62ac665ff9fc07497ca524bd20d6a96893d11071.tar.gz
linux-62ac665ff9fc07497ca524bd20d6a96893d11071.tar.bz2
linux-62ac665ff9fc07497ca524bd20d6a96893d11071.zip
blockdev: turn a rw semaphore into a percpu rw semaphore
This avoids cache line bouncing when many processes lock the semaphore for read. New percpu lock implementation The lock consists of an array of percpu unsigned integers, a boolean variable and a mutex. When we take the lock for read, we enter rcu read section, check for a "locked" variable. If it is false, we increase a percpu counter on the current cpu and exit the rcu section. If "locked" is true, we exit the rcu section, take the mutex and drop it (this waits until a writer finished) and retry. Unlocking for read just decreases percpu variable. Note that we can unlock on a difference cpu than where we locked, in this case the counter underflows. The sum of all percpu counters represents the number of processes that hold the lock for read. When we need to lock for write, we take the mutex, set "locked" variable to true and synchronize rcu. Since RCU has been synchronized, no processes can create new read locks. We wait until the sum of percpu counters is zero - when it is, there are no readers in the critical section. Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'fs/block_dev.c')
-rw-r--r--fs/block_dev.c27
1 files changed, 17 insertions, 10 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index cdfb625824e2..7eeb0635338b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -127,7 +127,7 @@ int set_blocksize(struct block_device *bdev, int size)
return -EINVAL;
/* Prevent starting I/O or mapping the device */
- down_write(&bdev->bd_block_size_semaphore);
+ percpu_down_write(&bdev->bd_block_size_semaphore);
/* Check that the block device is not memory mapped */
mapping = bdev->bd_inode->i_mapping;
@@ -135,7 +135,7 @@ int set_blocksize(struct block_device *bdev, int size)
if (!prio_tree_empty(&mapping->i_mmap) ||
!list_empty(&mapping->i_mmap_nonlinear)) {
mutex_unlock(&mapping->i_mmap_mutex);
- up_write(&bdev->bd_block_size_semaphore);
+ percpu_up_write(&bdev->bd_block_size_semaphore);
return -EBUSY;
}
mutex_unlock(&mapping->i_mmap_mutex);
@@ -148,7 +148,7 @@ int set_blocksize(struct block_device *bdev, int size)
kill_bdev(bdev);
}
- up_write(&bdev->bd_block_size_semaphore);
+ percpu_up_write(&bdev->bd_block_size_semaphore);
return 0;
}
@@ -460,6 +460,12 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
if (!ei)
return NULL;
+
+ if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
+ kmem_cache_free(bdev_cachep, ei);
+ return NULL;
+ }
+
return &ei->vfs_inode;
}
@@ -468,6 +474,8 @@ static void bdev_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct bdev_inode *bdi = BDEV_I(inode);
+ percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
+
kmem_cache_free(bdev_cachep, bdi);
}
@@ -491,7 +499,6 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
/* Initialize mutex for freeze. */
mutex_init(&bdev->bd_fsfreeze_mutex);
- init_rwsem(&bdev->bd_block_size_semaphore);
}
static inline void __bd_forget(struct inode *inode)
@@ -1593,11 +1600,11 @@ ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
ssize_t ret;
struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
- down_read(&bdev->bd_block_size_semaphore);
+ percpu_down_read(&bdev->bd_block_size_semaphore);
ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
- up_read(&bdev->bd_block_size_semaphore);
+ percpu_up_read(&bdev->bd_block_size_semaphore);
return ret;
}
@@ -1622,7 +1629,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
blk_start_plug(&plug);
- down_read(&bdev->bd_block_size_semaphore);
+ percpu_down_read(&bdev->bd_block_size_semaphore);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
if (ret > 0 || ret == -EIOCBQUEUED) {
@@ -1633,7 +1640,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
ret = err;
}
- up_read(&bdev->bd_block_size_semaphore);
+ percpu_up_read(&bdev->bd_block_size_semaphore);
blk_finish_plug(&plug);
@@ -1646,11 +1653,11 @@ int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
int ret;
struct block_device *bdev = I_BDEV(file->f_mapping->host);
- down_read(&bdev->bd_block_size_semaphore);
+ percpu_down_read(&bdev->bd_block_size_semaphore);
ret = generic_file_mmap(file, vma);
- up_read(&bdev->bd_block_size_semaphore);
+ percpu_up_read(&bdev->bd_block_size_semaphore);
return ret;
}