// SPDX-License-Identifier: GPL-2.0-or-later /* * Debugging module statistics. * * Copyright (C) 2023 Luis Chamberlain */ #include #include #include #include #include #include #include #include #include #include "internal.h" /** * DOC: module debugging statistics overview * * Enabling CONFIG_MODULE_STATS enables module debugging statistics which * are useful to monitor and root cause memory pressure issues with module * loading. These statistics are useful to allow us to improve production * workloads. * * The current module debugging statistics supported help keep track of module * loading failures to enable improvements either for kernel module auto-loading * usage (request_module()) or interactions with userspace. Statistics are * provided to track all possible failures in the finit_module() path and memory * wasted in this process space. Each of the failure counters are associated * to a type of module loading failure which is known to incur a certain amount * of memory allocation loss. In the worst case loading a module will fail after * a 3 step memory allocation process: * * a) memory allocated with kernel_read_file_from_fd() * b) module decompression processes the file read from * kernel_read_file_from_fd(), and vmap() is used to map * the decompressed module to a new local buffer which represents * a copy of the decompressed module passed from userspace. The buffer * from kernel_read_file_from_fd() is freed right away. * c) layout_and_allocate() allocates space for the final resting * place where we would keep the module if it were to be processed * successfully. * * If a failure occurs after these three different allocations only one * counter will be incremented with the summation of the allocated bytes freed * incurred during this failure. Likewise, if module loading failed only after * step b) a separate counter is used and incremented for the bytes freed and * not used during both of those allocations. 
* * Virtual memory space can be limited, for example on x86 virtual memory size * defaults to 128 MiB. We should strive to limit and avoid wasting virtual * memory allocations when possible. These module debugging statistics help * to evaluate how much memory is being wasted on bootup due to module loading * failures. * * All counters are designed to be incremental. Atomic counters are used so as to * remain simple and avoid delays and deadlocks. */ /** * DOC: dup_failed_modules - tracks duplicate failed modules * * Linked list of modules which failed to be loaded because an already existing * module with the same name was already being processed or already loaded. * The finit_module() system call incurs heavy virtual memory allocations. In * the worst case an finit_module() system call can end up allocating virtual * memory 3 times: * * 1) kernel_read_file_from_fd() call uses vmalloc() * 2) optional module decompression uses vmap() * 3) layout_and_allocate() can use vzalloc() or an arch specific variation of * vmalloc to deal with ELF sections requiring special permissions * * In practice on a typical boot today most finit_module() calls fail due to * the module with the same name already being loaded or about to be processed. * All virtual memory allocated to these failed modules will be freed with * no functional use. * * To help with this the dup_failed_modules list allows us to track modules which * failed to load due to the fact that a module was already loaded or being * processed. There are only two points at which we can fail such calls, * we list them below along with the number of virtual memory allocation * calls: * * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before * layout_and_allocate(). 
* - with module decompression: 2 virtual memory allocation calls * - without module decompression: 1 virtual memory allocation call * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module() * - with module decompression: 3 virtual memory allocation calls * - without module decompression: 2 virtual memory allocation calls * * We should strive to get this list to be as small as possible. If this list * is not empty it is a reflection of work or optimizations possible * either in-kernel or in userspace. */ static LIST_HEAD(dup_failed_modules); /** * DOC: module statistics debugfs counters * * The total amount of wasted virtual memory allocation space during module * loading can be computed by adding the total from the summation: * * * @invalid_kread_bytes + * @invalid_decompress_bytes + * @invalid_becoming_bytes + * @invalid_mod_bytes * * The following debugfs counters are available to inspect module loading * failures: * * * total_mod_size: total bytes ever used by all modules we've dealt with on * this system * * total_text_size: total bytes of the .text and .init.text ELF section * sizes we've dealt with on this system * * invalid_kread_bytes: bytes allocated and then freed on failures which * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd() * uses vmalloc(). These should typically not happen unless your system is * under memory pressure. * * invalid_decompress_bytes: number of bytes allocated and freed due to * memory allocations in the module decompression path that use vmap(). * These typically should not happen unless your system is under memory * pressure. * * invalid_becoming_bytes: total number of bytes allocated and freed used * to read the kernel module userspace wants us to read before we * promote it to be processed to be added to our @modules linked list. 
These * failures can happen if a check fails in between a successful kernel_read_file_from_fd() * call and right before we allocate our private memory for the module * which would be kept if the module is successfully loaded. The most common * reason for this failure is when userspace is racing to load a module * which it does not yet see loaded. The first module to succeed in * add_unformed_module() will add a module to our &modules list and * subsequent loads of modules with the same name will error out at the * end of early_mod_check(). The check for module_patient_check_exists() * at the end of early_mod_check() prevents duplicate allocations * on layout_and_allocate() for modules already being processed. These * duplicate failed modules are non-fatal, however they typically are * indicative of userspace not seeing a module in userspace loaded yet and * unnecessarily trying to load a module before the kernel even has a chance * to begin to process prior requests. Although duplicate failures can be * non-fatal, we should try to reduce vmalloc() pressure proactively, so * ideally after boot this will be as close to 0 as possible. If module * decompression was used we also add to this counter the cost of the * initial kernel_read_file_from_fd() of the compressed module. If module * decompression was not used the value represents the total allocated and * freed bytes in kernel_read_file_from_fd() calls for these types of * failures. 
These failures can occur because: * * * module_sig_check() - module signature checks * * elf_validity_cache_copy() - some ELF validation issue * * early_mod_check(): * * * blacklisting * * failed to rewrite section headers * * version magic * * live patch requirements didn't check out * * the module was detected as being already present * * * invalid_mod_bytes: this is the total number of bytes allocated and * freed due to failures after we did all the sanity checks of the module * which userspace passed to us and after our first check that the module * is unique. A module can still fail to load if we detect the module is * loaded after we allocate space for it with layout_and_allocate(), we do * this check right before processing the module as live and running its * initialization routines. Note that if you have a failure of this type it * also means the respective kernel_read_file_from_fd() memory space was * freed and not used, and so we increment this counter with twice * the size of the module. Additionally if you used module decompression * the size of the compressed module is also added to this counter. * * * modcount: how many modules we've loaded in our kernel life time * * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd() * * failed_decompress: how many failed module decompression attempts we've had. * These really should not happen unless your compression / decompression * is broken. * * failed_becoming: how many modules failed after we kernel_read_file_from_fd() * it and before we allocate memory for it with layout_and_allocate(). This * counter is never incremented if you manage to validate the module and * call layout_and_allocate() for it. * * failed_load_modules: how many modules failed once we've allocated our * private space for our module using layout_and_allocate(). These failures * should hopefully mostly be dealt with already. 
Races in theory could * still exist here, but it would just mean the kernel had started processing * two threads concurrently up to early_mod_check() and one thread won. * These failures are good signs the kernel or userspace is doing something * seriously stupid or that could be improved. We should strive to fix these, * but it is perhaps not easy to fix them. A recent example are the modules * requests incurred for frequency modules, a separate module request was * being issued for each CPU on a system. */ atomic_long_t total_mod_size; atomic_long_t total_text_size; atomic_long_t invalid_kread_bytes; atomic_long_t invalid_decompress_bytes; static atomic_long_t invalid_becoming_bytes; static atomic_long_t invalid_mod_bytes; atomic_t modcount; atomic_t failed_kreads; atomic_t failed_decompress; static atomic_t failed_becoming; static atomic_t failed_load_modules; static const char *mod_fail_to_str(struct mod_fail_load *mod_fail) { if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) && test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) return "Becoming & Load"; if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask)) return "Becoming"; if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask)) return "Load"; return "Bug-on-stats"; } void mod_stat_bump_invalid(struct load_info *info, int flags) { atomic_long_add(info->len * 2, &invalid_mod_bytes); atomic_inc(&failed_load_modules); #if defined(CONFIG_MODULE_DECOMPRESS) if (flags & MODULE_INIT_COMPRESSED_FILE) atomic_long_add(info->compressed_len, &invalid_mod_bytes); #endif } void mod_stat_bump_becoming(struct load_info *info, int flags) { atomic_inc(&failed_becoming); atomic_long_add(info->len, &invalid_becoming_bytes); #if defined(CONFIG_MODULE_DECOMPRESS) if (flags & MODULE_INIT_COMPRESSED_FILE) atomic_long_add(info->compressed_len, &invalid_becoming_bytes); #endif } int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason) { struct mod_fail_load *mod_fail; 
list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list, lockdep_is_held(&module_mutex)) { if (!strcmp(mod_fail->name, name)) { atomic_long_inc(&mod_fail->count); __set_bit(reason, &mod_fail->dup_fail_mask); goto out; } } mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL); if (!mod_fail) return -ENOMEM; memcpy(mod_fail->name, name, strlen(name)); __set_bit(reason, &mod_fail->dup_fail_mask); atomic_long_inc(&mod_fail->count); list_add_rcu(&mod_fail->list, &dup_failed_modules); out: return 0; } /* * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the * 112 module prints within 8k. * * 1024 + (64*112) = 8k */ #define MAX_PREAMBLE 1024 #define MAX_FAILED_MOD_PRINT 112 #define MAX_BYTES_PER_MOD 64 static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct mod_fail_load *mod_fail; unsigned int len, size, count_failed = 0; char *buf; int ret; u32 live_mod_count, fkreads, fdecompress, fbecoming, floads; unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes, idecompress_bytes, imod_bytes, total_virtual_lost; live_mod_count = atomic_read(&modcount); fkreads = atomic_read(&failed_kreads); fdecompress = atomic_read(&failed_decompress); fbecoming = atomic_read(&failed_becoming); floads = atomic_read(&failed_load_modules); total_size = atomic_long_read(&total_mod_size); text_size = atomic_long_read(&total_text_size); ikread_bytes = atomic_long_read(&invalid_kread_bytes); idecompress_bytes = atomic_long_read(&invalid_decompress_bytes); ibecoming_bytes = atomic_long_read(&invalid_becoming_bytes); imod_bytes = atomic_long_read(&invalid_mod_bytes); total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes; size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming), (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD; buf = kzalloc(size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; /* The beginning of our debug preamble */ len = scnprintf(buf, size, 
"%25s\t%u\n", "Mods ever loaded", live_mod_count); len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads); len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress", fdecompress); len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming); len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads); len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total module size", total_size); len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total mod text size", text_size); len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes); len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed decompress bytes", idecompress_bytes); len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes); len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes); len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost); if (live_mod_count && total_size) { len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod size", DIV_ROUND_UP(total_size, live_mod_count)); } if (live_mod_count && text_size) { len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod text size", DIV_ROUND_UP(text_size, live_mod_count)); } /* * We use WARN_ON_ONCE() for the counters to ensure we always have parity * for keeping tabs on a type of failure with one type of byte counter. * The counters for imod_bytes does not increase for fkreads failures * for example, and so on. 
*/ WARN_ON_ONCE(ikread_bytes && !fkreads); if (fkreads && ikread_bytes) { len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail kread bytes", DIV_ROUND_UP(ikread_bytes, fkreads)); } WARN_ON_ONCE(ibecoming_bytes && !fbecoming); if (fbecoming && ibecoming_bytes) { len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail becoming bytes", DIV_ROUND_UP(ibecoming_bytes, fbecoming)); } WARN_ON_ONCE(idecompress_bytes && !fdecompress); if (fdecompress && idecompress_bytes) { len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail decomp bytes", DIV_ROUND_UP(idecompress_bytes, fdecompress)); } WARN_ON_ONCE(imod_bytes && !floads); if (floads && imod_bytes) { len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average fail load bytes", DIV_ROUND_UP(imod_bytes, floads)); } /* End of our debug preamble header. */ /* Catch when we've gone beyond our expected preamble */ WARN_ON_ONCE(len >= MAX_PREAMBLE); if (list_empty(&dup_failed_modules)) goto out; len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n"); len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n", "Module-name", "How-many-times", "Reason"); mutex_lock(&module_mutex); list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) { if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT)) goto out_unlock; len += scnprintf(buf + len, size - len, "%25s\t%15lu\t%25s\n", mod_fail->name, atomic_long_read(&mod_fail->count), mod_fail_to_str(mod_fail)); } out_unlock: mutex_unlock(&module_mutex); out: ret = simple_read_from_buffer(user_buf, count, ppos, buf, len); kfree(buf); return ret; } #undef MAX_PREAMBLE #undef MAX_FAILED_MOD_PRINT #undef MAX_BYTES_PER_MOD static const struct file_operations fops_mod_stats = { .read = read_file_mod_stats, .open = simple_open, .owner = THIS_MODULE, .llseek = default_llseek, }; #define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter) #define mod_debug_add_atomic(name) 
debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name) static int __init module_stats_init(void) { mod_debug_add_ulong(total_mod_size); mod_debug_add_ulong(total_text_size); mod_debug_add_ulong(invalid_kread_bytes); mod_debug_add_ulong(invalid_decompress_bytes); mod_debug_add_ulong(invalid_becoming_bytes); mod_debug_add_ulong(invalid_mod_bytes); mod_debug_add_atomic(modcount); mod_debug_add_atomic(failed_kreads); mod_debug_add_atomic(failed_decompress); mod_debug_add_atomic(failed_becoming); mod_debug_add_atomic(failed_load_modules); debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats); return 0; } #undef mod_debug_add_ulong #undef mod_debug_add_atomic module_init(module_stats_init);