From 4aede84b33d6beb401136a3deca0651ae07c5e99 Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Tue, 12 Jul 2011 08:31:45 +0200 Subject: fixlet: Remove fs_excl from struct task. fs_excl is a poor man's priority inheritance for filesystems to hint to the block layer that an operation is important. It was never clearly specified, not widely adopted, and will not prevent starvation in many cases (like across cgroups). fs_excl was introduced with the time sliced CFQ IO scheduler, to indicate when a process held FS exclusive resources and thus needed a boost. It doesn't cover all file systems, and it was never fully complete. Lets kill it. Signed-off-by: Justin TerAvest Signed-off-by: Jens Axboe --- fs/super.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index ab3d672db0de..cf12ba50973b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -245,13 +245,11 @@ static int grab_super(struct super_block *s) __releases(sb_lock) */ void lock_super(struct super_block * sb) { - get_fs_excl(); mutex_lock(&sb->s_lock); } void unlock_super(struct super_block * sb) { - put_fs_excl(); mutex_unlock(&sb->s_lock); } @@ -280,7 +278,6 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); - get_fs_excl(); sb->s_flags &= ~MS_ACTIVE; fsnotify_unmount_inodes(&sb->s_inodes); @@ -295,7 +292,6 @@ void generic_shutdown_super(struct super_block *sb) "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); } - put_fs_excl(); } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ -- cgit v1.2.3 From 43e15cdbefea4ce6d68113de98d4f61c0cf45687 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jun 2011 20:16:57 -0400 Subject: new helper: iterate_supers_type() Call the given function for all superblocks of given type. Function gets a superblock (with s_umount locked shared) and (void *) argument supplied by caller of iterator. Signed-off-by: Al Viro --- fs/super.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index ab3d672db0de..444da9579068 100644 --- a/fs/super.c +++ b/fs/super.c @@ -451,6 +451,42 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_unlock(&sb_lock); } +/** + * iterate_supers_type - call function for superblocks of given type + * @type: fs type + * @f: function to call + * @arg: argument to pass to it + * + * Scans the superblock list and calls given function, passing it + * locked superblock and given argument. + */ +void iterate_supers_type(struct file_system_type *type, + void (*f)(struct super_block *, void *), void *arg) +{ + struct super_block *sb, *p = NULL; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &type->fs_supers, s_instances) { + sb->s_count++; + spin_unlock(&sb_lock); + + down_read(&sb->s_umount); + if (sb->s_root) + f(sb, arg); + up_read(&sb->s_umount); + + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; + } + if (p) + __put_super(p); + spin_unlock(&sb_lock); +} + +EXPORT_SYMBOL(iterate_supers_type); + /** * get_super - get the superblock of a device * @bdev: device to get the superblock for -- cgit v1.2.3 From 0ee5dc676a5f8fadede608c7281dfedb1ae714ea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 Jul 2011 15:44:25 -0400 Subject: btrfs: kill magical embedded struct superblock Signed-off-by: Al Viro --- fs/super.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 444da9579068..263edeb9f0e9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -693,7 +693,7 @@ static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ static int unnamed_dev_start = 0; /* don't bother trying below it */ -int set_anon_super(struct super_block *s, void *data) +int get_anon_bdev(dev_t *p) { int dev; int error; @@ -720,24 +720,38 @@ int set_anon_super(struct super_block *s, void *data) spin_unlock(&unnamed_dev_lock); return -EMFILE; } - s->s_dev = MKDEV(0, dev & MINORMASK); - s->s_bdi = &noop_backing_dev_info; + *p = MKDEV(0, dev & MINORMASK); return 0; } +EXPORT_SYMBOL(get_anon_bdev); -EXPORT_SYMBOL(set_anon_super); - -void kill_anon_super(struct super_block *sb) +void free_anon_bdev(dev_t dev) { - int slot = MINOR(sb->s_dev); - - generic_shutdown_super(sb); + int slot = MINOR(dev); spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, slot); if (slot < unnamed_dev_start) unnamed_dev_start = slot; spin_unlock(&unnamed_dev_lock); } +EXPORT_SYMBOL(free_anon_bdev); + +int set_anon_super(struct super_block *s, void *data) +{ + int error = get_anon_bdev(&s->s_dev); + if (!error) + s->s_bdi = &noop_backing_dev_info; + return error; +} + +EXPORT_SYMBOL(set_anon_super); + +void kill_anon_super(struct super_block *sb) +{ + dev_t dev = sb->s_dev; + generic_shutdown_super(sb); + free_anon_bdev(dev); +} EXPORT_SYMBOL(kill_anon_super); -- cgit v1.2.3 From 98b745c647a5a90c3c21ea43cbfad9a47b0dfad7 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:39 +1000 Subject: inode: Make unused inode LRU per superblock The inode unused list is currently a global LRU. This does not match the other global filesystem cache - the dentry cache - which uses per-superblock LRU lists. Hence we have related filesystem object types using different LRU reclaimation schemes. To enable a per-superblock filesystem cache shrinker, both of these caches need to have per-sb unused object LRU lists. Hence this patch converts the global inode LRU to per-sb LRUs. The patch only does rudimentary per-sb propotioning in the shrinker infrastructure, as this gets removed when the per-sb shrinker callouts are introduced later on. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 263edeb9f0e9..e8e6dbfefe8c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -77,6 +77,7 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); + INIT_LIST_HEAD(&s->s_inode_lru); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); -- cgit v1.2.3 From 09cc9fc7a7c3d872065426d7fb0f0ad6d3eb90fc Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:40 +1000 Subject: inode: move to per-sb LRU locks With the inode LRUs moving to per-sb structures, there is no longer a need for a global inode_lru_lock. The locking can be made more fine-grained by moving to a per-sb LRU lock, isolating the LRU operations of different filesytsems completely from each other. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index e8e6dbfefe8c..73ab9f9b3571 100644 --- a/fs/super.c +++ b/fs/super.c @@ -78,6 +78,7 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); INIT_LIST_HEAD(&s->s_inode_lru); + spin_lock_init(&s->s_inode_lru_lock); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); -- cgit v1.2.3 From 12ad3ab66103e6582ca69c0c9de18b13487eaaef Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:41 +1000 Subject: superblock: move pin_sb_for_writeback() to fs/super.c The per-sb shrinker has the same requirement as the writeback threads of ensuring that the superblock is usable and pinned for the time it takes to run the work. Both need to take a passive reference to the sb, take a read lock on the s_umount lock and then only continue if an unmount is not in progress. pin_sb_for_writeback() does this exactly, so move it to fs/super.c and rename it to grab_super_passive() and exporting it via fs/internal.h for all the VFS code to be able to use. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/super.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 73ab9f9b3571..e63c754447ce 100644 --- a/fs/super.c +++ b/fs/super.c @@ -242,6 +242,39 @@ static int grab_super(struct super_block *s) __releases(sb_lock) return 0; } +/* + * grab_super_passive - acquire a passive reference + * @s: reference we are trying to grab + * + * Tries to acquire a passive reference. This is used in places where we + * cannot take an active reference but we need to ensure that the + * superblock does not go away while we are working on it. It returns + * false if a reference was not gained, and returns true with the s_umount + * lock held in read mode if a reference is gained. On successful return, + * the caller must drop the s_umount lock and the passive reference when + * done. + */ +bool grab_super_passive(struct super_block *sb) +{ + spin_lock(&sb_lock); + if (list_empty(&sb->s_instances)) { + spin_unlock(&sb_lock); + return false; + } + + sb->s_count++; + spin_unlock(&sb_lock); + + if (down_read_trylock(&sb->s_umount)) { + if (sb->s_root) + return true; + up_read(&sb->s_umount); + } + + put_super(sb); + return false; +} + /* * Superblock locking. We really ought to get rid of these two. */ -- cgit v1.2.3 From b0d40c92adafde7c2d81203ce7c1c69275f41140 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:42 +1000 Subject: superblock: introduce per-sb cache shrinker infrastructure With context based shrinkers, we can implement a per-superblock shrinker that shrinks the caches attached to the superblock. We currently have global shrinkers for the inode and dentry caches that split up into per-superblock operations via a coarse proportioning method that does not batch very well. The global shrinkers also have a dependency - dentries pin inodes - so we have to be very careful about how we register the global shrinkers so that the implicit call order is always correct. With a per-sb shrinker callout, we can encode this dependency directly into the per-sb shrinker, hence avoiding the need for strictly ordering shrinker registrations. We also have no need for any proportioning code for the shrinker subsystem already provides this functionality across all shrinkers. Allowing the shrinker to operate on a single superblock at a time means that we do less superblock list traversals and locking and reclaim should batch more effectively. This should result in less CPU overhead for reclaim and potentially faster reclaim of items from each filesystem. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/super.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index e63c754447ce..37a75410079e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -38,6 +38,48 @@ LIST_HEAD(super_blocks); DEFINE_SPINLOCK(sb_lock); +/* + * One thing we have to be careful of with a per-sb shrinker is that we don't + * drop the last active reference to the superblock from within the shrinker. + * If that happens we could trigger unregistering the shrinker from within the + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * take a passive reference to the superblock to avoid this from occurring. + */ +static int prune_super(struct shrinker *shrink, struct shrink_control *sc) +{ + struct super_block *sb; + int count; + + sb = container_of(shrink, struct super_block, s_shrink); + + /* + * Deadlock avoidance. We may hold various FS locks, and we don't want + * to recurse into the FS that called us in clear_inode() and friends.. + */ + if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS)) + return -1; + + if (!grab_super_passive(sb)) + return -1; + + if (sc->nr_to_scan) { + /* proportion the scan between the two caches */ + int total; + + total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1; + count = (sc->nr_to_scan * sb->s_nr_dentry_unused) / total; + + /* prune dcache first as icache is pinned by it */ + prune_dcache_sb(sb, count); + prune_icache_sb(sb, sc->nr_to_scan - count); + } + + count = ((sb->s_nr_dentry_unused + sb->s_nr_inodes_unused) / 100) + * sysctl_vfs_cache_pressure; + drop_super(sb); + return count; +} + /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to @@ -116,6 +158,9 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_op = &default_op; s->s_time_gran = 1000000000; s->cleancache_poolid = -1; + + s->s_shrink.seeks = DEFAULT_SEEKS; + s->s_shrink.shrink = prune_super; } out: return s; @@ -183,6 +228,10 @@ void deactivate_locked_super(struct super_block *s) if (atomic_dec_and_test(&s->s_active)) { cleancache_flush_fs(s); fs->kill_sb(s); + + /* caches are now gone, we can safely kill the shrinker now */ + unregister_shrinker(&s->s_shrink); + /* * We need to call rcu_barrier so all the delayed rcu free * inodes are flushed before we release the fs module. @@ -311,7 +360,6 @@ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; - if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); @@ -399,6 +447,7 @@ retry: list_add(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); + register_shrinker(&s->s_shrink); return s; } -- cgit v1.2.3 From 0e1fdafd93980eac62e778798549ce0f6073905c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:44 +1000 Subject: superblock: add filesystem shrinker operations Now we have a per-superblock shrinker implementation, we can add a filesystem specific callout to it to allow filesystem internal caches to be shrunk by the superblock shrinker. Rather than perpetuate the multipurpose shrinker callback API (i.e. nr_to_scan == 0 meaning "tell me how many objects freeable in the cache), two operations will be added. The first will return the number of objects that are freeable, the second is the actual shrinker call. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/super.c | 45 +++++++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 12 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 37a75410079e..5101f0544960 100644 --- a/fs/super.c +++ b/fs/super.c @@ -48,7 +48,8 @@ DEFINE_SPINLOCK(sb_lock); static int prune_super(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; - int count; + int fs_objects = 0; + int total_objects; sb = container_of(shrink, struct super_block, s_shrink); @@ -62,22 +63,42 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) if (!grab_super_passive(sb)) return -1; - if (sc->nr_to_scan) { - /* proportion the scan between the two caches */ - int total; + if (sb->s_op && sb->s_op->nr_cached_objects) + fs_objects = sb->s_op->nr_cached_objects(sb); + + total_objects = sb->s_nr_dentry_unused + + sb->s_nr_inodes_unused + fs_objects + 1; - total = sb->s_nr_dentry_unused + sb->s_nr_inodes_unused + 1; - count = (sc->nr_to_scan * sb->s_nr_dentry_unused) / total; + if (sc->nr_to_scan) { + int dentries; + int inodes; + + /* proportion the scan between the caches */ + dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) / + total_objects; + inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) / + total_objects; + if (fs_objects) + fs_objects = (sc->nr_to_scan * fs_objects) / + total_objects; + /* + * prune the dcache first as the icache is pinned by it, then + * prune the icache, followed by the filesystem specific caches + */ + prune_dcache_sb(sb, dentries); + prune_icache_sb(sb, inodes); - /* prune dcache first as icache is pinned by it */ - prune_dcache_sb(sb, count); - prune_icache_sb(sb, sc->nr_to_scan - count); + if (fs_objects && sb->s_op->free_cached_objects) { + sb->s_op->free_cached_objects(sb, fs_objects); + fs_objects = sb->s_op->nr_cached_objects(sb); + } + total_objects = sb->s_nr_dentry_unused + + sb->s_nr_inodes_unused + fs_objects; } - count = ((sb->s_nr_dentry_unused + sb->s_nr_inodes_unused) / 100) - * sysctl_vfs_cache_pressure; + total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure; drop_super(sb); - return count; + return total_objects; } /** -- cgit v1.2.3 From 8ab47664d51a69ea79fe70bb07ca80664f74f76b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 8 Jul 2011 14:14:45 +1000 Subject: vfs: increase shrinker batch size Now that the per-sb shrinker is responsible for shrinking 2 or more caches, increase the batch size to keep econmies of scale for shrinking each cache. Increase the shrinker batch size to 1024 objects. To allow for a large increase in batch size, add a conditional reschedule to prune_icache_sb() so that we don't hold the LRU spin lock for too long. This mirrors the behaviour of the __shrink_dcache_sb(), and allows us to increase the batch size without needing to worry about problems caused by long lock hold times. To ensure that filesystems using the per-sb shrinker callouts don't cause problems, document that the object freeing method must reschedule appropriately inside loops. Signed-off-by: Dave Chinner Signed-off-by: Al Viro --- fs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 5101f0544960..7943f04cb3a9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -182,6 +182,7 @@ static struct super_block *alloc_super(struct file_system_type *type) s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.shrink = prune_super; + s->s_shrink.batch = 1024; } out: return s; -- cgit v1.2.3 From 09f363c7363eb10cfb4b82094bd7064e5608258b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 31 Oct 2011 17:08:57 -0700 Subject: vmscan: fix shrinker callback bug in fs/super.c The callback must not return -1 when nr_to_scan is zero. Fix the bug in fs/super.c and add this requirement to the callback specification. Signed-off-by: Mikulas Patocka Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 3f56a269a4f4..32a81f3467e0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -61,7 +61,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) return -1; if (!grab_super_passive(sb)) - return -1; + return !sc->nr_to_scan ? 0 : -1; if (sb->s_op && sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb); -- cgit v1.2.3 From 2833eb2b465a274d1a2529fed76c6d2904f8022b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 28 Oct 2011 14:13:20 +0200 Subject: vfs: ignore error on forced remount On emergency remount we want to force MS_RDONLY on the super block even if ->remount_fs() failed for some reason. Signed-off-by: Miklos Szeredi Tested-by: Toshiyuki Okajima Signed-off-by: Christoph Hellwig --- fs/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 32a81f3467e0..afd0f1ad45e0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -727,8 +727,13 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); - if (retval) - return retval; + if (retval) { + if (!force) + return retval; + /* If forced remount, go ahead despite any errors */ + WARN(1, "forced remount of a %s fs returned %i\n", + sb->s_type->name, retval); + } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); -- cgit v1.2.3