author    Harshad Shirwadkar <harshadshirwadkar@gmail.com>    2021-04-01 10:21:27 -0700
committer Theodore Ts'o <tytso@mit.edu>                      2021-04-09 11:34:59 -0400
commit    196e402adf2e4cd66f101923409f1970ec5f1af3 (patch)
tree      28a178b4e874a5789fb208c7e4e1c3fc3b93a078 /fs/ext4/ext4.h
parent    4b68f6df105966f04f45f1eca0561b86f2b3551d (diff)
ext4: improve cr 0 / cr 1 group scanning
Instead of traversing through groups linearly, scan groups in specific orders at cr 0 and cr 1.

At cr 0, we want to find groups whose largest free order is >= the order of the request. So, with this patch, we maintain a list for each possible order and insert each group into the list matching the largest free order in its buddy bitmap. During cr 0 allocation, we traverse these lists in increasing order of largest free order. This allows us to find a group with the best available cr 0 match in constant time. If nothing can be found, we fall back to cr 1 immediately.

At cr 1, the story is slightly different: we want to traverse groups in increasing order of average fragment size. For cr 1, we maintain an rb tree of groupinfos sorted by average fragment size, and traverse it in that order, starting at the most optimal group. This brings the cr 1 search complexity down to log(num groups).

For cr >= 2, we just perform the linear search as before. Also, in case of lock contention, we intermittently fall back to linear search even in the cr 0 and cr 1 cases, so that the allocation path can make progress even under high contention.

There is an opportunity for optimization at cr 2 too, because at cr 2 we only consider groups where the bb_free counter (number of free blocks) is greater than the requested extent size. That is left as future work.

All the changes introduced in this patch are gated behind a new mount option, "mb_optimize_scan".

With this patchset, the following experiment was performed: a highly fragmented 65TB disk with no contiguous 2M regions was created, and the following command was run 3 times in a row:

time dd if=/dev/urandom of=file bs=2M count=10

Here are the results with and without the cr 0/1 optimizations introduced in this patch:

|---------+------------------------------+---------------------------|
|         | Without CR 0/1 Optimizations | With CR 0/1 Optimizations |
|---------+------------------------------+---------------------------|
| 1st run | 5m1.871s                     | 2m47.642s                 |
| 2nd run | 2m28.390s                    | 0m0.611s                  |
| 3rd run | 2m26.530s                    | 0m1.255s                  |
|---------+------------------------------+---------------------------|

Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Link: https://lore.kernel.org/r/20210401172129.189766-6-harshadshirwadkar@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
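To make the cr 0 structure concrete, here is a minimal user-space C sketch of the idea — not the kernel implementation. MAX_ORDER, struct group, and the singly linked lists are illustrative stand-ins for the patch's struct list_head entries in s_mb_largest_free_orders (which are protected by per-order rwlocks):

/*
 * Minimal user-space sketch of the cr 0 idea: one list per possible
 * "largest free order"; a request of order k scans lists k, k+1, ...
 * until a non-empty one is found. Illustrative names throughout.
 */
#include <stdio.h>

#define MAX_ORDER 14 /* assumed bound on buddy orders, for illustration */

struct group {
	int group_no;
	int largest_free_order; /* order of largest free extent in buddy bitmap */
	struct group *next;
};

static struct group *order_lists[MAX_ORDER];

/* Insert a group into the list matching its largest free order. */
static void insert_group(struct group *g)
{
	g->next = order_lists[g->largest_free_order];
	order_lists[g->largest_free_order] = g;
}

/*
 * cr 0 lookup: the first non-empty list at order >= request_order yields
 * a candidate group; the loop is bounded by MAX_ORDER, not by group count.
 */
static struct group *find_group(int request_order)
{
	for (int o = request_order; o < MAX_ORDER; o++)
		if (order_lists[o])
			return order_lists[o];
	return NULL; /* no suitable group: fall back to cr 1 */
}

int main(void)
{
	struct group g1 = { .group_no = 1, .largest_free_order = 3 };
	struct group g2 = { .group_no = 2, .largest_free_order = 7 };

	insert_group(&g1);
	insert_group(&g2);
	/* An order-5 request skips g1 (order 3) and finds g2 (order 7). */
	struct group *g = find_group(5);
	printf("order-5 request -> group %d\n", g ? g->group_no : -1);
	return 0;
}

The point of the per-order lists is that the scan cost is bounded by the number of orders, not by the number of groups.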
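Similarly, a sketch of the cr 1 ordering, with a plain unbalanced binary search tree standing in for the kernel's rb tree; free_blocks and fragments are illustrative stand-ins for bb_free and bb_fragments:

/*
 * Sketch of the cr 1 ordering: groups keyed by average fragment size
 * (free blocks / free fragments). A plain BST stands in for the rb tree.
 */
#include <stdio.h>

struct group {
	int group_no;
	int free_blocks; /* stand-in for bb_free */
	int fragments;   /* stand-in for bb_fragments */
	struct group *left, *right;
};

static int avg_frag(const struct group *g)
{
	return g->fragments ? g->free_blocks / g->fragments : 0;
}

/* Insert keyed by average fragment size, as the patch does on the rb tree. */
static struct group *insert(struct group *root, struct group *g)
{
	if (!root)
		return g;
	if (avg_frag(g) < avg_frag(root))
		root->left = insert(root->left, g);
	else
		root->right = insert(root->right, g);
	return root;
}

/* An in-order walk visits groups in increasing average fragment size. */
static void in_order(const struct group *root)
{
	if (!root)
		return;
	in_order(root->left);
	printf("group %d: avg fragment size %d\n", root->group_no, avg_frag(root));
	in_order(root->right);
}

int main(void)
{
	struct group a = { 1, 512, 8, NULL, NULL };  /* avg 64 */
	struct group b = { 2, 1024, 4, NULL, NULL }; /* avg 256 */
	struct group c = { 3, 256, 16, NULL, NULL }; /* avg 16 */
	struct group *root = NULL;

	root = insert(root, &a);
	root = insert(root, &b);
	root = insert(root, &c);
	in_order(root); /* prints groups 3, 1, 2 */
	return 0;
}

With a balanced tree such as the rb tree in this patch, locating the starting group costs O(log(num groups)); the sketch's unbalanced BST only illustrates the ordering, not the balancing.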
Diffstat (limited to 'fs/ext4/ext4.h')
-rw-r--r--  fs/ext4/ext4.h | 21
1 file changed, 19 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 67a675d222f2..42df628b35e5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -162,7 +162,12 @@ enum SHIFT_DIRECTION {
#define EXT4_MB_USE_RESERVED 0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000
-
+/* Large fragment size list lookup succeeded at least once for cr = 0 */
+#define EXT4_MB_CR0_OPTIMIZED 0x8000
+/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
+#define EXT4_MB_CR1_OPTIMIZED 0x00010000
+/* Perform linear traversal for one group */
+#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000
struct ext4_allocation_request {
/* target inode for block we're allocating */
struct inode *inode;
@@ -1247,7 +1252,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */
#define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */
#define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */
-
+#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group
+						  * scanning in mballoc
+						  */
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
@@ -1528,9 +1535,14 @@ struct ext4_sb_info {
unsigned int s_mb_free_pending;
struct list_head s_freed_data_list; /* List of blocks to be freed
after commit completed */
+ struct rb_root s_mb_avg_fragment_size_root;
+ rwlock_t s_mb_rb_lock;
+ struct list_head *s_mb_largest_free_orders;
+ rwlock_t *s_mb_largest_free_orders_locks;
/* tunables */
unsigned long s_stripe;
+ unsigned int s_mb_max_linear_groups;
unsigned int s_mb_stream_request;
unsigned int s_mb_max_to_scan;
unsigned int s_mb_min_to_scan;
@@ -1554,6 +1566,8 @@ struct ext4_sb_info {
atomic_t s_bal_goals; /* goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
+ atomic_t s_bal_cr0_bad_suggestions;
+ atomic_t s_bal_cr1_bad_suggestions;
atomic64_t s_bal_cX_groups_considered[4];
atomic64_t s_bal_cX_hits[4];
atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */
@@ -3360,11 +3374,14 @@ struct ext4_group_info {
ext4_grpblk_t bb_free; /* total free blocks */
ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
+ ext4_group_t bb_group; /* Group number */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
+ struct rb_node bb_avg_fragment_size_rb;
+ struct list_head bb_largest_free_order_node;
ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
* regions, index is order.
* bb_counters[3] = 5 means
* 5 free 8-block regions */