diff options
author | Theodore Ts'o <tytso@mit.edu> | 2015-02-02 00:37:00 -0500 |
---|---|---|
committer | Al Viro <viro@zeniv.linux.org.uk> | 2015-02-05 02:45:00 -0500 |
commit | 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8 (patch) | |
tree | 660dbb014482092361eab263847fb906b5a9ec22 /fs/inode.c | |
parent | e36f014edff70fc02b3d3d79cead1d58f289332e (diff) | |
download | linux-0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8.tar.gz linux-0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8.tar.bz2 linux-0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8.zip |
vfs: add support for a lazytime mount option
Add a new mount option which enables a new "lazytime" mode. This mode
causes atime, mtime, and ctime updates to only be made to the
in-memory version of the inode. The on-disk times will only get
updated when (a) if the inode needs to be updated for some non-time
related change, (b) if userspace calls fsync(), syncfs() or sync(), or
(c) just before an undeleted inode is evicted from memory.
This is OK according to POSIX because there are no guarantees after a
crash unless userspace explicitly requests via a fsync(2) call.
For workloads which feature a large number of random write to a
preallocated file, the lazytime mount option significantly reduces
writes to the inode table. The repeated 4k writes to a single block
will result in undesirable stress on flash devices and SMR disk
drives. Even on conventional HDD's, the repeated writes to the inode
table block will trigger Adjacent Track Interference (ATI) remediation
latencies, which very negatively impact long tail latencies --- which
is a very big deal for web serving tiers (for example).
Google-Bug-Id: 18297052
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs/inode.c')
-rw-r--r-- | fs/inode.c | 56 |
1 files changed, 40 insertions, 16 deletions
diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..4feb85cc125f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -18,6 +18,7 @@ #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> #include <linux/list_lru.h> +#include <trace/events/writeback.h> #include "internal.h" /* @@ -30,7 +31,7 @@ * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * @@ -416,7 +417,8 @@ static void inode_lru_list_add(struct inode *inode) */ void inode_add_lru(struct inode *inode) { - if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && + if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | + I_FREEING | I_WILL_FREE)) && !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) inode_lru_list_add(inode); } @@ -647,7 +649,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & I_DIRTY && !kill_dirty) { + if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { spin_unlock(&inode->i_lock); busy = 1; continue; @@ -1432,11 +1434,20 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { - if (inode) { - BUG_ON(inode->i_state & I_CLEAR); - - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) - iput_final(inode); + if (!inode) + return; + BUG_ON(inode->i_state & I_CLEAR); +retry: + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { + if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { + atomic_inc(&inode->i_count); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; + } + iput_final(inode); } } EXPORT_SYMBOL(iput); @@ -1495,14 +1506,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, return 0; } -/* - * This does the actual work of updating an inodes time or version. Must have - * had called mnt_want_write() before calling this. - */ -static int update_time(struct inode *inode, struct timespec *time, int flags) +int generic_update_time(struct inode *inode, struct timespec *time, int flags) { - if (inode->i_op->update_time) - return inode->i_op->update_time(inode, time, flags); + int iflags = I_DIRTY_TIME; if (flags & S_ATIME) inode->i_atime = *time; @@ -1512,9 +1518,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) inode->i_ctime = *time; if (flags & S_MTIME) inode->i_mtime = *time; - mark_inode_dirty_sync(inode); + + if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) + iflags |= I_DIRTY_SYNC; + __mark_inode_dirty(inode, iflags); return 0; } +EXPORT_SYMBOL(generic_update_time); + +/* + * This does the actual work of updating an inodes time or version. Must have + * had called mnt_want_write() before calling this. + */ +static int update_time(struct inode *inode, struct timespec *time, int flags) +{ + int (*update_time)(struct inode *, struct timespec *, int); + + update_time = inode->i_op->update_time ? inode->i_op->update_time : + generic_update_time; + + return update_time(inode, time, flags); +} /** * touch_atime - update the access time |