diff options
author | brookxu <brookxu.cn@gmail.com> | 2020-08-17 15:36:15 +0800 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2020-08-19 12:04:36 -0400 |
commit | 27bc446e2def38db3244a6eb4bb1d6312936610a (patch) | |
tree | 567a878460844a91b28ae1a70221f2e3c8e106ee /fs/ext4/mballoc.c | |
parent | 66d5e0277e225cdc5d272fc22b1aa90a9b0d21ac (diff) | |
download | linux-27bc446e2def38db3244a6eb4bb1d6312936610a.tar.gz linux-27bc446e2def38db3244a6eb4bb1d6312936610a.tar.bz2 linux-27bc446e2def38db3244a6eb4bb1d6312936610a.zip |
ext4: limit the length of per-inode prealloc list
In the scenario of writing sparse files, the per-inode prealloc list may
be very long, resulting in high overhead for ext4_mb_use_preallocated().
To circumvent this problem, we limit the maximum length of per-inode
prealloc list to 512 and allow users to modify it.
After patching, we observed that the sys ratio of cpu has dropped, and
the system throughput has increased significantly. We created a process
to write the sparse file, and the running time of the process on the
fixed kernel was significantly reduced, as follows:
Running time on unfixed kernel:
[root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
real 0m2.051s
user 0m0.008s
sys 0m2.026s
Running time on fixed kernel:
[root@TENCENT64 ~]# time taskset 0x01 ./sparse /data1/sparce.dat
real 0m0.471s
user 0m0.004s
sys 0m0.395s
Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Link: https://lore.kernel.org/r/d7a98178-056b-6db5-6bce-4ead23f4a257@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r-- | fs/ext4/mballoc.c | 74 |
1 files changed, 66 insertions, 8 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 45ac6088b4ac..132c118d12e1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2878,6 +2878,7 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file @@ -3816,6 +3817,26 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); } +static void ext4_mb_mark_pa_deleted(struct super_block *sb, + struct ext4_prealloc_space *pa) +{ + struct ext4_inode_info *ei; + + if (pa->pa_deleted) { + ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", + pa->pa_type, pa->pa_pstart, pa->pa_lstart, + pa->pa_len); + return; + } + + pa->pa_deleted = 1; + + if (pa->pa_type == MB_INODE_PA) { + ei = EXT4_I(pa->pa_inode); + atomic_dec(&ei->i_prealloc_active); + } +} + static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; @@ -3848,7 +3869,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, return; } - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; @@ -3972,6 +3993,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) spin_lock(pa->pa_obj_lock); list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); spin_unlock(pa->pa_obj_lock); + atomic_inc(&ei->i_prealloc_active); } /* @@ -4182,7 +4204,7 @@ repeat: } /* seems this one can be freed ... */ - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); /* we can trust pa_free ... */ free += pa->pa_free; @@ -4245,7 +4267,7 @@ out_dbg: * * FIXME!! Make sure it is valid at all the call sites */ -void ext4_discard_preallocations(struct inode *inode) +void ext4_discard_preallocations(struct inode *inode, unsigned int needed) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -4263,15 +4285,19 @@ void ext4_discard_preallocations(struct inode *inode) mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); - trace_ext4_discard_preallocations(inode); + trace_ext4_discard_preallocations(inode, + atomic_read(&ei->i_prealloc_active), needed); INIT_LIST_HEAD(&list); + if (needed == 0) + needed = UINT_MAX; + repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); - while (!list_empty(&ei->i_prealloc_list)) { - pa = list_entry(ei->i_prealloc_list.next, + while (!list_empty(&ei->i_prealloc_list) && needed) { + pa = list_entry(ei->i_prealloc_list.prev, struct ext4_prealloc_space, pa_inode_list); BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); spin_lock(&pa->pa_lock); @@ -4288,10 +4314,11 @@ repeat: } if (pa->pa_deleted == 0) { - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_inode_list); list_add(&pa->u.pa_tmp_list, &list); + needed--; continue; } @@ -4592,7 +4619,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ - pa->pa_deleted = 1; + ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_inode_list); @@ -4691,10 +4718,29 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) } /* + * if per-inode prealloc list is too long, trim some PA + */ +static void ext4_mb_trim_inode_pa(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int count, delta; + + count = atomic_read(&ei->i_prealloc_active); + delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1; + if (count > sbi->s_mb_max_inode_prealloc + delta) { + count -= sbi->s_mb_max_inode_prealloc; + ext4_discard_preallocations(inode, count); + } +} + +/* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { + struct inode *inode = ac->ac_inode; + struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { @@ -4720,6 +4766,17 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) ext4_mb_add_n_trim(ac); } } + + if (pa->pa_type == MB_INODE_PA) { + /* + * treat per-inode prealloc list as a lru list, then try + * to trim the least recently used PA. + */ + spin_lock(pa->pa_obj_lock); + list_move(&pa->pa_inode_list, &ei->i_prealloc_list); + spin_unlock(pa->pa_obj_lock); + } + ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) @@ -4729,6 +4786,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); + ext4_mb_trim_inode_pa(inode); return 0; } |