From 09d967c6f32b35eab15b45862ae16e4f06259d8e Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 22 Jun 2006 14:47:21 -0700 Subject: [PATCH] Fix a race condition between ->i_mapping and iput() This race became a cause of oops, and can reproduce by the following. while true; do dd if=/dev/zero of=/dev/.static/dev/hdg1 bs=512 count=1000 & sync done This race condition was between __sync_single_inode() and iput(). cpu0 (fs's inode) cpu1 (bdev's inode) ----------------- ------------------- close("/dev/hda2") [...] __sync_single_inode() /* copy the bdev's ->i_mapping */ mapping = inode->i_mapping; generic_forget_inode() bdev_clear_inode() /* restre the fs's ->i_mapping */ inode->i_mapping = &inode->i_data; /* bdev's inode was freed */ destroy_inode(inode); if (wait) { /* dereference a freed bdev's mapping->host */ filemap_fdatawait(mapping); /* Oops */ Since __sync_single_inode() is only taking a ref-count of fs's inode, the another process can be close() and freeing the bdev's inode while writing fs's inode. So, __sync_signle_inode() accesses the freed ->i_mapping, oops. This patch takes a ref-count on the bdev's inode for the fs's inode before setting a ->i_mapping, and the clear_inode() of the fs's inode does iput() on the bdev's inode. So if the fs's inode is still living, bdev's inode shouldn't be freed. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index f5958f413bd1..44aaba202f78 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -414,21 +414,31 @@ EXPORT_SYMBOL(bdput); static struct block_device *bd_acquire(struct inode *inode) { struct block_device *bdev; + spin_lock(&bdev_lock); bdev = inode->i_bdev; - if (bdev && igrab(bdev->bd_inode)) { + if (bdev) { + atomic_inc(&bdev->bd_inode->i_count); spin_unlock(&bdev_lock); return bdev; } spin_unlock(&bdev_lock); + bdev = bdget(inode->i_rdev); if (bdev) { spin_lock(&bdev_lock); - if (inode->i_bdev) - __bd_forget(inode); - inode->i_bdev = bdev; - inode->i_mapping = bdev->bd_inode->i_mapping; - list_add(&inode->i_devices, &bdev->bd_inodes); + if (!inode->i_bdev) { + /* + * We take an additional bd_inode->i_count for inode, + * and it's released in clear_inode() of inode. + * So, we can access it via ->i_mapping always + * without igrab(). + */ + atomic_inc(&bdev->bd_inode->i_count); + inode->i_bdev = bdev; + inode->i_mapping = bdev->bd_inode->i_mapping; + list_add(&inode->i_devices, &bdev->bd_inodes); + } spin_unlock(&bdev_lock); } return bdev; @@ -438,10 +448,18 @@ static struct block_device *bd_acquire(struct inode *inode) void bd_forget(struct inode *inode) { + struct block_device *bdev = NULL; + spin_lock(&bdev_lock); - if (inode->i_bdev) + if (inode->i_bdev) { + if (inode->i_sb != blockdev_superblock) + bdev = inode->i_bdev; __bd_forget(inode); + } spin_unlock(&bdev_lock); + + if (bdev) + iput(bdev->bd_inode); } int bd_claim(struct block_device *bdev, void *holder) -- cgit v1.2.3 From 454e2398be9b9fa30433fccc548db34d19aa9958 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2006 02:02:57 -0700 Subject: [PATCH] VFS: Permit filesystem to override root dentry on mount Extend the get_sb() filesystem operation to take an extra argument that permits the VFS to pass in the target vfsmount that defines the mountpoint. The filesystem is then required to manually set the superblock and root dentry pointers. For most filesystems, this should be done with simple_set_mnt() which will set the superblock pointer and then set the root dentry to the superblock's s_root (as per the old default behaviour). The get_sb() op now returns an integer as there's now no need to return the superblock pointer. This patch permits a superblock to be implicitly shared amongst several mount points, such as can be done with NFS to avoid potential inode aliasing. In such a case, simple_set_mnt() would not be called, and instead the mnt_root and mnt_sb would be set directly. The patch also makes the following changes: (*) the get_sb_*() convenience functions in the core kernel now take a vfsmount pointer argument and return an integer, so most filesystems have to change very little. (*) If one of the convenience function is not used, then get_sb() should normally call simple_set_mnt() to instantiate the vfsmount. This will always return 0, and so can be tail-called from get_sb(). (*) generic_shutdown_super() now calls shrink_dcache_sb() to clean up the dcache upon superblock destruction rather than shrink_dcache_anon(). This is required because the superblock may now have multiple trees that aren't actually bound to s_root, but that still need to be cleaned up. The currently called functions assume that the whole tree is rooted at s_root, and that anonymous dentries are not the roots of trees which results in dentries being left unculled. However, with the way NFS superblock sharing are currently set to be implemented, these assumptions are violated: the root of the filesystem is simply a dummy dentry and inode (the real inode for '/' may well be inaccessible), and all the vfsmounts are rooted on anonymous[*] dentries with child trees. [*] Anonymous until discovered from another tree. (*) The documentation has been adjusted, including the additional bit of changing ext2_* into foo_* in the documentation. [akpm@osdl.org: convert ipath_fs, do other stuff] Signed-off-by: David Howells Acked-by: Al Viro Cc: Nathan Scott Cc: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/block_dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index 44aaba202f78..028d9fb9c2d5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -300,10 +300,10 @@ static struct super_operations bdev_sops = { .clear_inode = bdev_clear_inode, }; -static struct super_block *bd_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int bd_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); + return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); } static struct file_system_type bd_type = { -- cgit v1.2.3