// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "admin-state.h"
#include "block-map.h"
#include "completion.h"
#include "constants.h"
#include "data-vio.h"
#include "dedupe.h"
#include "dump.h"
#include "encodings.h"
#include "errors.h"
#include "flush.h"
#include "io-submitter.h"
#include "logger.h"
#include "memory-alloc.h"
#include "message-stats.h"
#include "recovery-journal.h"
#include "repair.h"
#include "slab-depot.h"
#include "status-codes.h"
#include "string-utils.h"
#include "thread-device.h"
#include "thread-registry.h"
#include "thread-utils.h"
#include "types.h"
#include "vdo.h"
#include "vio.h"

enum admin_phases {
	GROW_LOGICAL_PHASE_START,
	GROW_LOGICAL_PHASE_GROW_BLOCK_MAP,
	GROW_LOGICAL_PHASE_END,
	GROW_LOGICAL_PHASE_ERROR,
	GROW_PHYSICAL_PHASE_START,
	GROW_PHYSICAL_PHASE_COPY_SUMMARY,
	GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS,
	GROW_PHYSICAL_PHASE_USE_NEW_SLABS,
	GROW_PHYSICAL_PHASE_END,
	GROW_PHYSICAL_PHASE_ERROR,
	LOAD_PHASE_START,
	LOAD_PHASE_LOAD_DEPOT,
	LOAD_PHASE_MAKE_DIRTY,
	LOAD_PHASE_PREPARE_TO_ALLOCATE,
	LOAD_PHASE_SCRUB_SLABS,
	LOAD_PHASE_DATA_REDUCTION,
	LOAD_PHASE_FINISHED,
	LOAD_PHASE_DRAIN_JOURNAL,
	LOAD_PHASE_WAIT_FOR_READ_ONLY,
	PRE_LOAD_PHASE_START,
	PRE_LOAD_PHASE_LOAD_COMPONENTS,
	PRE_LOAD_PHASE_END,
	PREPARE_GROW_PHYSICAL_PHASE_START,
	RESUME_PHASE_START,
	RESUME_PHASE_ALLOW_READ_ONLY_MODE,
	RESUME_PHASE_DEDUPE,
	RESUME_PHASE_DEPOT,
	RESUME_PHASE_JOURNAL,
	RESUME_PHASE_BLOCK_MAP,
	RESUME_PHASE_LOGICAL_ZONES,
	RESUME_PHASE_PACKER,
	RESUME_PHASE_FLUSHER,
	RESUME_PHASE_DATA_VIOS,
	RESUME_PHASE_END,
	SUSPEND_PHASE_START,
	SUSPEND_PHASE_PACKER,
	SUSPEND_PHASE_DATA_VIOS,
	SUSPEND_PHASE_DEDUPE,
	SUSPEND_PHASE_FLUSHES,
	SUSPEND_PHASE_LOGICAL_ZONES,
	SUSPEND_PHASE_BLOCK_MAP,
	SUSPEND_PHASE_JOURNAL,
	SUSPEND_PHASE_DEPOT,
	SUSPEND_PHASE_READ_ONLY_WAIT,
	SUSPEND_PHASE_WRITE_SUPER_BLOCK,
	SUSPEND_PHASE_END,
};

static const char * const ADMIN_PHASE_NAMES[] = {
	"GROW_LOGICAL_PHASE_START",
	"GROW_LOGICAL_PHASE_GROW_BLOCK_MAP",
	"GROW_LOGICAL_PHASE_END",
	"GROW_LOGICAL_PHASE_ERROR",
	"GROW_PHYSICAL_PHASE_START",
	"GROW_PHYSICAL_PHASE_COPY_SUMMARY",
	"GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS",
	"GROW_PHYSICAL_PHASE_USE_NEW_SLABS",
	"GROW_PHYSICAL_PHASE_END",
	"GROW_PHYSICAL_PHASE_ERROR",
	"LOAD_PHASE_START",
	"LOAD_PHASE_LOAD_DEPOT",
	"LOAD_PHASE_MAKE_DIRTY",
	"LOAD_PHASE_PREPARE_TO_ALLOCATE",
	"LOAD_PHASE_SCRUB_SLABS",
	"LOAD_PHASE_DATA_REDUCTION",
	"LOAD_PHASE_FINISHED",
	"LOAD_PHASE_DRAIN_JOURNAL",
	"LOAD_PHASE_WAIT_FOR_READ_ONLY",
	"PRE_LOAD_PHASE_START",
	"PRE_LOAD_PHASE_LOAD_COMPONENTS",
	"PRE_LOAD_PHASE_END",
	"PREPARE_GROW_PHYSICAL_PHASE_START",
	"RESUME_PHASE_START",
	"RESUME_PHASE_ALLOW_READ_ONLY_MODE",
	"RESUME_PHASE_DEDUPE",
	"RESUME_PHASE_DEPOT",
	"RESUME_PHASE_JOURNAL",
	"RESUME_PHASE_BLOCK_MAP",
	"RESUME_PHASE_LOGICAL_ZONES",
	"RESUME_PHASE_PACKER",
	"RESUME_PHASE_FLUSHER",
	"RESUME_PHASE_DATA_VIOS",
	"RESUME_PHASE_END",
	"SUSPEND_PHASE_START",
	"SUSPEND_PHASE_PACKER",
	"SUSPEND_PHASE_DATA_VIOS",
	"SUSPEND_PHASE_DEDUPE",
	"SUSPEND_PHASE_FLUSHES",
	"SUSPEND_PHASE_LOGICAL_ZONES",
	"SUSPEND_PHASE_BLOCK_MAP",
	"SUSPEND_PHASE_JOURNAL",
	"SUSPEND_PHASE_DEPOT",
	"SUSPEND_PHASE_READ_ONLY_WAIT",
	"SUSPEND_PHASE_WRITE_SUPER_BLOCK",
	"SUSPEND_PHASE_END",
};

/* If we bump this, update the arrays below */
#define TABLE_VERSION 4

/* arrays for handling different table versions */
static const u8 REQUIRED_ARGC[] = { 10, 12, 9, 7, 6 };
/* pool name no longer used.
only here for verification of older versions */ static const u8 POOL_NAME_ARG_INDEX[] = { 8, 10, 8 }; /* * Track in-use instance numbers using a flat bit array. * * O(n) run time isn't ideal, but if we have 1000 VDO devices in use simultaneously we still only * need to scan 16 words, so it's not likely to be a big deal compared to other resource usage. */ /* * This minimum size for the bit array creates a numbering space of 0-999, which allows * successive starts of the same volume to have different instance numbers in any * reasonably-sized test. Changing instances on restart allows vdoMonReport to detect that * the ephemeral stats have reset to zero. */ #define BIT_COUNT_MINIMUM 1000 /* Grow the bit array by this many bits when needed */ #define BIT_COUNT_INCREMENT 100 struct instance_tracker { unsigned int bit_count; unsigned long *words; unsigned int count; unsigned int next; }; static DEFINE_MUTEX(instances_lock); static struct instance_tracker instances; /** * free_device_config() - Free a device config created by parse_device_config(). * @config: The config to free. */ static void free_device_config(struct device_config *config) { if (config == NULL) return; if (config->owned_device != NULL) dm_put_device(config->owning_target, config->owned_device); vdo_free(config->parent_device_name); vdo_free(config->original_string); /* Reduce the chance a use-after-free (as in BZ 1669960) happens to work. */ memset(config, 0, sizeof(*config)); vdo_free(config); } /** * get_version_number() - Decide the version number from argv. * * @argc: The number of table values. * @argv: The array of table values. * @error_ptr: A pointer to return a error string in. * @version_ptr: A pointer to return the version. * * Return: VDO_SUCCESS or an error code. */ static int get_version_number(int argc, char **argv, char **error_ptr, unsigned int *version_ptr) { /* version, if it exists, is in a form of V */ if (sscanf(argv[0], "V%u", version_ptr) == 1) { if (*version_ptr < 1 || *version_ptr > TABLE_VERSION) { *error_ptr = "Unknown version number detected"; return VDO_BAD_CONFIGURATION; } } else { /* V0 actually has no version number in the table string */ *version_ptr = 0; } /* * V0 and V1 have no optional parameters. There will always be a parameter for thread * config, even if it's a "." to show it's an empty list. */ if (*version_ptr <= 1) { if (argc != REQUIRED_ARGC[*version_ptr]) { *error_ptr = "Incorrect number of arguments for version"; return VDO_BAD_CONFIGURATION; } } else if (argc < REQUIRED_ARGC[*version_ptr]) { *error_ptr = "Incorrect number of arguments for version"; return VDO_BAD_CONFIGURATION; } if (*version_ptr != TABLE_VERSION) { vdo_log_warning("Detected version mismatch between kernel module and tools kernel: %d, tool: %d", TABLE_VERSION, *version_ptr); vdo_log_warning("Please consider upgrading management tools to match kernel."); } return VDO_SUCCESS; } /* Free a list of non-NULL string pointers, and then the list itself. */ static void free_string_array(char **string_array) { unsigned int offset; for (offset = 0; string_array[offset] != NULL; offset++) vdo_free(string_array[offset]); vdo_free(string_array); } /* * Split the input string into substrings, separated at occurrences of the indicated character, * returning a null-terminated list of string pointers. * * The string pointers and the pointer array itself should both be freed with vdo_free() when no * longer needed. This can be done with vdo_free_string_array (below) if the pointers in the array * are not changed. 
Since the array and copied strings are allocated by this function, it may only * be used in contexts where allocation is permitted. * * Empty substrings are not ignored; that is, returned substrings may be empty strings if the * separator occurs twice in a row. */ static int split_string(const char *string, char separator, char ***substring_array_ptr) { unsigned int current_substring = 0, substring_count = 1; const char *s; char **substrings; int result; ptrdiff_t length; for (s = string; *s != 0; s++) { if (*s == separator) substring_count++; } result = vdo_allocate(substring_count + 1, char *, "string-splitting array", &substrings); if (result != VDO_SUCCESS) return result; for (s = string; *s != 0; s++) { if (*s == separator) { ptrdiff_t length = s - string; result = vdo_allocate(length + 1, char, "split string", &substrings[current_substring]); if (result != VDO_SUCCESS) { free_string_array(substrings); return result; } /* * Trailing NUL is already in place after allocation; deal with the zero or * more non-NUL bytes in the string. */ if (length > 0) memcpy(substrings[current_substring], string, length); string = s + 1; current_substring++; BUG_ON(current_substring >= substring_count); } } /* Process final string, with no trailing separator. */ BUG_ON(current_substring != (substring_count - 1)); length = strlen(string); result = vdo_allocate(length + 1, char, "split string", &substrings[current_substring]); if (result != VDO_SUCCESS) { free_string_array(substrings); return result; } memcpy(substrings[current_substring], string, length); current_substring++; /* substrings[current_substring] is NULL already */ *substring_array_ptr = substrings; return VDO_SUCCESS; } /* * Join the input substrings into one string, joined with the indicated character, returning a * string. array_length is a bound on the number of valid elements in substring_array, in case it * is not NULL-terminated. */ static int join_strings(char **substring_array, size_t array_length, char separator, char **string_ptr) { size_t string_length = 0; size_t i; int result; char *output, *current_position; for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) string_length += strlen(substring_array[i]) + 1; result = vdo_allocate(string_length, char, __func__, &output); if (result != VDO_SUCCESS) return result; current_position = &output[0]; for (i = 0; (i < array_length) && (substring_array[i] != NULL); i++) { current_position = vdo_append_to_buffer(current_position, output + string_length, "%s", substring_array[i]); *current_position = separator; current_position++; } /* We output one too many separators; replace the last with a zero byte. */ if (current_position != output) *(current_position - 1) = '\0'; *string_ptr = output; return VDO_SUCCESS; } /** * parse_bool() - Parse a two-valued option into a bool. * @bool_str: The string value to convert to a bool. * @true_str: The string value which should be converted to true. * @false_str: The string value which should be converted to false. * @bool_ptr: A pointer to return the bool value in. * * Return: VDO_SUCCESS or an error if bool_str is neither true_str nor false_str. 
*/ static inline int __must_check parse_bool(const char *bool_str, const char *true_str, const char *false_str, bool *bool_ptr) { bool value = false; if (strcmp(bool_str, true_str) == 0) value = true; else if (strcmp(bool_str, false_str) == 0) value = false; else return VDO_BAD_CONFIGURATION; *bool_ptr = value; return VDO_SUCCESS; } /** * process_one_thread_config_spec() - Process one component of a thread parameter configuration * string and update the configuration data structure. * @thread_param_type: The type of thread specified. * @count: The thread count requested. * @config: The configuration data structure to update. * * If the thread count requested is invalid, a message is logged and -EINVAL returned. If the * thread name is unknown, a message is logged but no error is returned. * * Return: VDO_SUCCESS or -EINVAL */ static int process_one_thread_config_spec(const char *thread_param_type, unsigned int count, struct thread_count_config *config) { /* Handle limited thread parameters */ if (strcmp(thread_param_type, "bioRotationInterval") == 0) { if (count == 0) { vdo_log_error("thread config string error: 'bioRotationInterval' of at least 1 is required"); return -EINVAL; } else if (count > VDO_BIO_ROTATION_INTERVAL_LIMIT) { vdo_log_error("thread config string error: 'bioRotationInterval' cannot be higher than %d", VDO_BIO_ROTATION_INTERVAL_LIMIT); return -EINVAL; } config->bio_rotation_interval = count; return VDO_SUCCESS; } if (strcmp(thread_param_type, "logical") == 0) { if (count > MAX_VDO_LOGICAL_ZONES) { vdo_log_error("thread config string error: at most %d 'logical' threads are allowed", MAX_VDO_LOGICAL_ZONES); return -EINVAL; } config->logical_zones = count; return VDO_SUCCESS; } if (strcmp(thread_param_type, "physical") == 0) { if (count > MAX_VDO_PHYSICAL_ZONES) { vdo_log_error("thread config string error: at most %d 'physical' threads are allowed", MAX_VDO_PHYSICAL_ZONES); return -EINVAL; } config->physical_zones = count; return VDO_SUCCESS; } /* Handle other thread count parameters */ if (count > MAXIMUM_VDO_THREADS) { vdo_log_error("thread config string error: at most %d '%s' threads are allowed", MAXIMUM_VDO_THREADS, thread_param_type); return -EINVAL; } if (strcmp(thread_param_type, "hash") == 0) { config->hash_zones = count; return VDO_SUCCESS; } if (strcmp(thread_param_type, "cpu") == 0) { if (count == 0) { vdo_log_error("thread config string error: at least one 'cpu' thread required"); return -EINVAL; } config->cpu_threads = count; return VDO_SUCCESS; } if (strcmp(thread_param_type, "ack") == 0) { config->bio_ack_threads = count; return VDO_SUCCESS; } if (strcmp(thread_param_type, "bio") == 0) { if (count == 0) { vdo_log_error("thread config string error: at least one 'bio' thread required"); return -EINVAL; } config->bio_threads = count; return VDO_SUCCESS; } /* * Don't fail, just log. This will handle version mismatches between user mode tools and * kernel. */ vdo_log_info("unknown thread parameter type \"%s\"", thread_param_type); return VDO_SUCCESS; } /** * parse_one_thread_config_spec() - Parse one component of a thread parameter configuration string * and update the configuration data structure. * @spec: The thread parameter specification string. * @config: The configuration data to be updated. 
*/ static int parse_one_thread_config_spec(const char *spec, struct thread_count_config *config) { unsigned int count; char **fields; int result; result = split_string(spec, '=', &fields); if (result != VDO_SUCCESS) return result; if ((fields[0] == NULL) || (fields[1] == NULL) || (fields[2] != NULL)) { vdo_log_error("thread config string error: expected thread parameter assignment, saw \"%s\"", spec); free_string_array(fields); return -EINVAL; } result = kstrtouint(fields[1], 10, &count); if (result) { vdo_log_error("thread config string error: integer value needed, found \"%s\"", fields[1]); free_string_array(fields); return result; } result = process_one_thread_config_spec(fields[0], count, config); free_string_array(fields); return result; } /** * parse_thread_config_string() - Parse the configuration string passed and update the specified * counts and other parameters of various types of threads to be * created. * @string: Thread parameter configuration string. * @config: The thread configuration data to update. * * The configuration string should contain one or more comma-separated specs of the form * "typename=number"; the supported type names are "cpu", "ack", "bio", "bioRotationInterval", * "logical", "physical", and "hash". * * If an error occurs during parsing of a single key/value pair, we deem it serious enough to stop * further parsing. * * This function can't set the "reason" value the caller wants to pass back, because we'd want to * format it to say which field was invalid, and we can't allocate the "reason" strings * dynamically. So if an error occurs, we'll log the details and pass back an error. * * Return: VDO_SUCCESS or -EINVAL or -ENOMEM */ static int parse_thread_config_string(const char *string, struct thread_count_config *config) { int result = VDO_SUCCESS; char **specs; if (strcmp(".", string) != 0) { unsigned int i; result = split_string(string, ',', &specs); if (result != VDO_SUCCESS) return result; for (i = 0; specs[i] != NULL; i++) { result = parse_one_thread_config_spec(specs[i], config); if (result != VDO_SUCCESS) break; } free_string_array(specs); } return result; } /** * process_one_key_value_pair() - Process one component of an optional parameter string and update * the configuration data structure. * @key: The optional parameter key name. * @value: The optional parameter value. * @config: The configuration data structure to update. * * If the value requested is invalid, a message is logged and -EINVAL returned. If the key is * unknown, a message is logged but no error is returned. * * Return: VDO_SUCCESS or -EINVAL */ static int process_one_key_value_pair(const char *key, unsigned int value, struct device_config *config) { /* Non thread optional parameters */ if (strcmp(key, "maxDiscard") == 0) { if (value == 0) { vdo_log_error("optional parameter error: at least one max discard block required"); return -EINVAL; } /* Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 */ if (value > (UINT_MAX / VDO_BLOCK_SIZE)) { vdo_log_error("optional parameter error: at most %d max discard blocks are allowed", UINT_MAX / VDO_BLOCK_SIZE); return -EINVAL; } config->max_discard_blocks = value; return VDO_SUCCESS; } /* Handles unknown key names */ return process_one_thread_config_spec(key, value, &config->thread_counts); } /** * parse_one_key_value_pair() - Parse one key/value pair and update the configuration data * structure. * @key: The optional key name. * @value: The optional value. * @config: The configuration data to be updated. 
* * Return: VDO_SUCCESS or error. */ static int parse_one_key_value_pair(const char *key, const char *value, struct device_config *config) { unsigned int count; int result; if (strcmp(key, "deduplication") == 0) return parse_bool(value, "on", "off", &config->deduplication); if (strcmp(key, "compression") == 0) return parse_bool(value, "on", "off", &config->compression); /* The remaining arguments must have integral values. */ result = kstrtouint(value, 10, &count); if (result) { vdo_log_error("optional config string error: integer value needed, found \"%s\"", value); return result; } return process_one_key_value_pair(key, count, config); } /** * parse_key_value_pairs() - Parse all key/value pairs from a list of arguments. * @argc: The total number of arguments in list. * @argv: The list of key/value pairs. * @config: The device configuration data to update. * * If an error occurs during parsing of a single key/value pair, we deem it serious enough to stop * further parsing. * * This function can't set the "reason" value the caller wants to pass back, because we'd want to * format it to say which field was invalid, and we can't allocate the "reason" strings * dynamically. So if an error occurs, we'll log the details and return the error. * * Return: VDO_SUCCESS or error */ static int parse_key_value_pairs(int argc, char **argv, struct device_config *config) { int result = VDO_SUCCESS; while (argc) { result = parse_one_key_value_pair(argv[0], argv[1], config); if (result != VDO_SUCCESS) break; argc -= 2; argv += 2; } return result; } /** * parse_optional_arguments() - Parse the configuration string passed in for optional arguments. * @arg_set: The structure holding the arguments to parse. * @error_ptr: Pointer to a buffer to hold the error string. * @config: Pointer to device configuration data to update. * * For V0/V1 configurations, there will only be one optional parameter; the thread configuration. * The configuration string should contain one or more comma-separated specs of the form * "typename=number"; the supported type names are "cpu", "ack", "bio", "bioRotationInterval", * "logical", "physical", and "hash". * * For V2 configurations and beyond, there could be any number of arguments. They should contain * one or more key/value pairs separated by a space. * * Return: VDO_SUCCESS or error */ static int parse_optional_arguments(struct dm_arg_set *arg_set, char **error_ptr, struct device_config *config) { int result = VDO_SUCCESS; if (config->version == 0 || config->version == 1) { result = parse_thread_config_string(arg_set->argv[0], &config->thread_counts); if (result != VDO_SUCCESS) { *error_ptr = "Invalid thread-count configuration"; return VDO_BAD_CONFIGURATION; } } else { if ((arg_set->argc % 2) != 0) { *error_ptr = "Odd number of optional arguments given but they should be pairs"; return VDO_BAD_CONFIGURATION; } result = parse_key_value_pairs(arg_set->argc, arg_set->argv, config); if (result != VDO_SUCCESS) { *error_ptr = "Invalid optional argument configuration"; return VDO_BAD_CONFIGURATION; } } return result; } /** * handle_parse_error() - Handle a parsing error. * @config: The config to free. * @error_ptr: A place to store a constant string about the error. * @error_str: A constant string to store in error_ptr. */ static void handle_parse_error(struct device_config *config, char **error_ptr, char *error_str) { free_device_config(config); *error_ptr = error_str; } /** * parse_device_config() - Convert the dmsetup table into a struct device_config. 
* @argc: The number of table values. * @argv: The array of table values. * @ti: The target structure for this table. * @config_ptr: A pointer to return the allocated config. * * Return: VDO_SUCCESS or an error code. */ static int parse_device_config(int argc, char **argv, struct dm_target *ti, struct device_config **config_ptr) { bool enable_512e; size_t logical_bytes = to_bytes(ti->len); struct dm_arg_set arg_set; char **error_ptr = &ti->error; struct device_config *config = NULL; int result; if ((logical_bytes % VDO_BLOCK_SIZE) != 0) { handle_parse_error(config, error_ptr, "Logical size must be a multiple of 4096"); return VDO_BAD_CONFIGURATION; } if (argc == 0) { handle_parse_error(config, error_ptr, "Incorrect number of arguments"); return VDO_BAD_CONFIGURATION; } result = vdo_allocate(1, struct device_config, "device_config", &config); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, "Could not allocate config structure"); return VDO_BAD_CONFIGURATION; } config->owning_target = ti; config->logical_blocks = logical_bytes / VDO_BLOCK_SIZE; INIT_LIST_HEAD(&config->config_list); /* Save the original string. */ result = join_strings(argv, argc, ' ', &config->original_string); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, "Could not populate string"); return VDO_BAD_CONFIGURATION; } vdo_log_info("table line: %s", config->original_string); config->thread_counts = (struct thread_count_config) { .bio_ack_threads = 1, .bio_threads = DEFAULT_VDO_BIO_SUBMIT_QUEUE_COUNT, .bio_rotation_interval = DEFAULT_VDO_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL, .cpu_threads = 1, .logical_zones = 0, .physical_zones = 0, .hash_zones = 0, }; config->max_discard_blocks = 1; config->deduplication = true; config->compression = false; arg_set.argc = argc; arg_set.argv = argv; result = get_version_number(argc, argv, error_ptr, &config->version); if (result != VDO_SUCCESS) { /* get_version_number sets error_ptr itself. */ handle_parse_error(config, error_ptr, *error_ptr); return result; } /* Move the arg pointer forward only if the argument was there. */ if (config->version >= 1) dm_shift_arg(&arg_set); result = vdo_duplicate_string(dm_shift_arg(&arg_set), "parent device name", &config->parent_device_name); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, "Could not copy parent device name"); return VDO_BAD_CONFIGURATION; } /* Get the physical blocks, if known. */ if (config->version >= 1) { result = kstrtoull(dm_shift_arg(&arg_set), 10, &config->physical_blocks); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, "Invalid physical block count"); return VDO_BAD_CONFIGURATION; } } /* Get the logical block size and validate */ result = parse_bool(dm_shift_arg(&arg_set), "512", "4096", &enable_512e); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, "Invalid logical block size"); return VDO_BAD_CONFIGURATION; } config->logical_block_size = (enable_512e ? 512 : 4096); /* Skip past the two no longer used read cache options. */ if (config->version <= 1) dm_consume_args(&arg_set, 2); /* Get the page cache size. */ result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->cache_size); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, "Invalid block map page cache size"); return VDO_BAD_CONFIGURATION; } /* Get the block map era length. 
*/ result = kstrtouint(dm_shift_arg(&arg_set), 10, &config->block_map_maximum_age); if (result != VDO_SUCCESS) { handle_parse_error(config, error_ptr, "Invalid block map maximum age"); return VDO_BAD_CONFIGURATION; } /* Skip past the no longer used MD RAID5 optimization mode */ if (config->version <= 2) dm_consume_args(&arg_set, 1); /* Skip past the no longer used write policy setting */ if (config->version <= 3) dm_consume_args(&arg_set, 1); /* Skip past the no longer used pool name for older table lines */ if (config->version <= 2) { /* * Make sure the enum to get the pool name from argv directly is still in sync with * the parsing of the table line. */ if (&arg_set.argv[0] != &argv[POOL_NAME_ARG_INDEX[config->version]]) { handle_parse_error(config, error_ptr, "Pool name not in expected location"); return VDO_BAD_CONFIGURATION; } dm_shift_arg(&arg_set); } /* Get the optional arguments and validate. */ result = parse_optional_arguments(&arg_set, error_ptr, config); if (result != VDO_SUCCESS) { /* parse_optional_arguments sets error_ptr itself. */ handle_parse_error(config, error_ptr, *error_ptr); return result; } /* * Logical, physical, and hash zone counts can all be zero; then we get one thread doing * everything, our older configuration. If any zone count is non-zero, the others must be * as well. */ if (((config->thread_counts.logical_zones == 0) != (config->thread_counts.physical_zones == 0)) || ((config->thread_counts.physical_zones == 0) != (config->thread_counts.hash_zones == 0))) { handle_parse_error(config, error_ptr, "Logical, physical, and hash zones counts must all be zero or all non-zero"); return VDO_BAD_CONFIGURATION; } if (config->cache_size < (2 * MAXIMUM_VDO_USER_VIOS * config->thread_counts.logical_zones)) { handle_parse_error(config, error_ptr, "Insufficient block map cache for logical zones"); return VDO_BAD_CONFIGURATION; } result = dm_get_device(ti, config->parent_device_name, dm_table_get_mode(ti->table), &config->owned_device); if (result != 0) { vdo_log_error("couldn't open device \"%s\": error %d", config->parent_device_name, result); handle_parse_error(config, error_ptr, "Unable to open storage device"); return VDO_BAD_CONFIGURATION; } if (config->version == 0) { u64 device_size = bdev_nr_bytes(config->owned_device->bdev); config->physical_blocks = device_size / VDO_BLOCK_SIZE; } *config_ptr = config; return result; } static struct vdo *get_vdo_for_target(struct dm_target *ti) { return ((struct device_config *) ti->private)->vdo; } static int vdo_map_bio(struct dm_target *ti, struct bio *bio) { struct vdo *vdo = get_vdo_for_target(ti); struct vdo_work_queue *current_work_queue; const struct admin_state_code *code = vdo_get_admin_state_code(&vdo->admin.state); VDO_ASSERT_LOG_ONLY(code->normal, "vdo should not receive bios while in state %s", code->name); /* Count all incoming bios. */ vdo_count_bios(&vdo->stats.bios_in, bio); /* Handle empty bios. Empty flush bios are not associated with a vio. 
*/ if ((bio_op(bio) == REQ_OP_FLUSH) || ((bio->bi_opf & REQ_PREFLUSH) != 0)) { vdo_launch_flush(vdo, bio); return DM_MAPIO_SUBMITTED; } /* This could deadlock, */ current_work_queue = vdo_get_current_work_queue(); BUG_ON((current_work_queue != NULL) && (vdo == vdo_get_work_queue_owner(current_work_queue)->vdo)); vdo_launch_bio(vdo->data_vio_pool, bio); return DM_MAPIO_SUBMITTED; } static void vdo_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct vdo *vdo = get_vdo_for_target(ti); limits->logical_block_size = vdo->device_config->logical_block_size; limits->physical_block_size = VDO_BLOCK_SIZE; /* The minimum io size for random io */ limits->io_min = VDO_BLOCK_SIZE; /* The optimal io size for streamed/sequential io */ limits->io_opt = VDO_BLOCK_SIZE; /* * Sets the maximum discard size that will be passed into VDO. This value comes from a * table line value passed in during dmsetup create. * * The value 1024 is the largest usable value on HD systems. A 2048 sector discard on a * busy HD system takes 31 seconds. We should use a value no higher than 1024, which takes * 15 to 16 seconds on a busy HD system. However, using large values results in 120 second * blocked task warnings in kernel logs. In order to avoid these warnings, we choose to * use the smallest reasonable value. * * The value is used by dm-thin to determine whether to pass down discards. The block layer * splits large discards on this boundary when this is set. */ limits->max_hw_discard_sectors = (vdo->device_config->max_discard_blocks * VDO_SECTORS_PER_BLOCK); /* * Force discards to not begin or end with a partial block by stating the granularity is * 4k. */ limits->discard_granularity = VDO_BLOCK_SIZE; } static int vdo_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { struct device_config *config = get_vdo_for_target(ti)->device_config; return fn(ti, config->owned_device, 0, config->physical_blocks * VDO_SECTORS_PER_BLOCK, data); } /* * Status line is: * * */ static void vdo_status(struct dm_target *ti, status_type_t status_type, unsigned int status_flags, char *result, unsigned int maxlen) { struct vdo *vdo = get_vdo_for_target(ti); struct vdo_statistics *stats; struct device_config *device_config; /* N.B.: The DMEMIT macro uses the variables named "sz", "result", "maxlen". */ int sz = 0; switch (status_type) { case STATUSTYPE_INFO: /* Report info for dmsetup status */ mutex_lock(&vdo->stats_mutex); vdo_fetch_statistics(vdo, &vdo->stats_buffer); stats = &vdo->stats_buffer; DMEMIT("/dev/%pg %s %s %s %s %llu %llu", vdo_get_backing_device(vdo), stats->mode, stats->in_recovery_mode ? "recovering" : "-", vdo_get_dedupe_index_state_name(vdo->hash_zones), vdo_get_compressing(vdo) ? "online" : "offline", stats->data_blocks_used + stats->overhead_blocks_used, stats->physical_blocks); mutex_unlock(&vdo->stats_mutex); break; case STATUSTYPE_TABLE: /* Report the string actually specified in the beginning. */ device_config = (struct device_config *) ti->private; DMEMIT("%s", device_config->original_string); break; case STATUSTYPE_IMA: /* FIXME: We ought to be more detailed here, but this is what thin does. 
*/ *result = '\0'; break; } } static block_count_t __must_check get_underlying_device_block_count(const struct vdo *vdo) { return bdev_nr_bytes(vdo_get_backing_device(vdo)) / VDO_BLOCK_SIZE; } static int __must_check process_vdo_message_locked(struct vdo *vdo, unsigned int argc, char **argv) { if ((argc == 2) && (strcasecmp(argv[0], "compression") == 0)) { if (strcasecmp(argv[1], "on") == 0) { vdo_set_compressing(vdo, true); return 0; } if (strcasecmp(argv[1], "off") == 0) { vdo_set_compressing(vdo, false); return 0; } vdo_log_warning("invalid argument '%s' to dmsetup compression message", argv[1]); return -EINVAL; } vdo_log_warning("unrecognized dmsetup message '%s' received", argv[0]); return -EINVAL; } /* * If the message is a dump, just do it. Otherwise, check that no other message is being processed, * and only proceed if so. * Returns -EBUSY if another message is being processed */ static int __must_check process_vdo_message(struct vdo *vdo, unsigned int argc, char **argv) { int result; /* * All messages which may be processed in parallel with other messages should be handled * here before the atomic check below. Messages which should be exclusive should be * processed in process_vdo_message_locked(). */ /* Dump messages should always be processed */ if (strcasecmp(argv[0], "dump") == 0) return vdo_dump(vdo, argc, argv, "dmsetup message"); if (argc == 1) { if (strcasecmp(argv[0], "dump-on-shutdown") == 0) { vdo->dump_on_shutdown = true; return 0; } /* Index messages should always be processed */ if ((strcasecmp(argv[0], "index-close") == 0) || (strcasecmp(argv[0], "index-create") == 0) || (strcasecmp(argv[0], "index-disable") == 0) || (strcasecmp(argv[0], "index-enable") == 0)) return vdo_message_dedupe_index(vdo->hash_zones, argv[0]); } if (atomic_cmpxchg(&vdo->processing_message, 0, 1) != 0) return -EBUSY; result = process_vdo_message_locked(vdo, argc, argv); /* Pairs with the implicit barrier in cmpxchg just above */ smp_wmb(); atomic_set(&vdo->processing_message, 0); return result; } static int vdo_message(struct dm_target *ti, unsigned int argc, char **argv, char *result_buffer, unsigned int maxlen) { struct registered_thread allocating_thread, instance_thread; struct vdo *vdo; int result; if (argc == 0) { vdo_log_warning("unspecified dmsetup message"); return -EINVAL; } vdo = get_vdo_for_target(ti); vdo_register_allocating_thread(&allocating_thread, NULL); vdo_register_thread_device_id(&instance_thread, &vdo->instance); /* * Must be done here so we don't map return codes. The code in dm-ioctl expects a 1 for a * return code to look at the buffer and see if it is full or not. */ if ((argc == 1) && (strcasecmp(argv[0], "stats") == 0)) { vdo_write_stats(vdo, result_buffer, maxlen); result = 1; } else if ((argc == 1) && (strcasecmp(argv[0], "config") == 0)) { vdo_write_config(vdo, &result_buffer, &maxlen); result = 1; } else { result = vdo_status_to_errno(process_vdo_message(vdo, argc, argv)); } vdo_unregister_thread_device_id(); vdo_unregister_allocating_thread(); return result; } static void configure_target_capabilities(struct dm_target *ti) { ti->discards_supported = 1; ti->flush_supported = true; ti->num_discard_bios = 1; ti->num_flush_bios = 1; /* * If this value changes, please make sure to update the value for max_discard_sectors * accordingly. */ BUG_ON(dm_set_target_max_io_len(ti, VDO_SECTORS_PER_BLOCK) != 0); } /* * Implements vdo_filter_fn. 
*/ static bool vdo_uses_device(struct vdo *vdo, const void *context) { const struct device_config *config = context; return vdo_get_backing_device(vdo)->bd_dev == config->owned_device->bdev->bd_dev; } /** * get_thread_id_for_phase() - Get the thread id for the current phase of the admin operation in * progress. */ static thread_id_t __must_check get_thread_id_for_phase(struct vdo *vdo) { switch (vdo->admin.phase) { case RESUME_PHASE_PACKER: case RESUME_PHASE_FLUSHER: case SUSPEND_PHASE_PACKER: case SUSPEND_PHASE_FLUSHES: return vdo->thread_config.packer_thread; case RESUME_PHASE_DATA_VIOS: case SUSPEND_PHASE_DATA_VIOS: return vdo->thread_config.cpu_thread; case LOAD_PHASE_DRAIN_JOURNAL: case RESUME_PHASE_JOURNAL: case SUSPEND_PHASE_JOURNAL: return vdo->thread_config.journal_thread; default: return vdo->thread_config.admin_thread; } } static struct vdo_completion *prepare_admin_completion(struct vdo *vdo, vdo_action_fn callback, vdo_action_fn error_handler) { struct vdo_completion *completion = &vdo->admin.completion; /* * We can't use vdo_prepare_completion_for_requeue() here because we don't want to reset * any error in the completion. */ completion->callback = callback; completion->error_handler = error_handler; completion->callback_thread_id = get_thread_id_for_phase(vdo); completion->requeue = true; return completion; } /** * advance_phase() - Increment the phase of the current admin operation and prepare the admin * completion to run on the thread for the next phase. * @vdo: The on which an admin operation is being performed * * Return: The current phase */ static u32 advance_phase(struct vdo *vdo) { u32 phase = vdo->admin.phase++; vdo->admin.completion.callback_thread_id = get_thread_id_for_phase(vdo); vdo->admin.completion.requeue = true; return phase; } /* * Perform an administrative operation (load, suspend, grow logical, or grow physical). This method * should not be called from vdo threads. */ static int perform_admin_operation(struct vdo *vdo, u32 starting_phase, vdo_action_fn callback, vdo_action_fn error_handler, const char *type) { int result; struct vdo_administrator *admin = &vdo->admin; if (atomic_cmpxchg(&admin->busy, 0, 1) != 0) { return vdo_log_error_strerror(VDO_COMPONENT_BUSY, "Can't start %s operation, another operation is already in progress", type); } admin->phase = starting_phase; reinit_completion(&admin->callback_sync); vdo_reset_completion(&admin->completion); vdo_launch_completion(prepare_admin_completion(vdo, callback, error_handler)); /* * Using the "interruptible" interface means that Linux will not log a message when we wait * for more than 120 seconds. */ while (wait_for_completion_interruptible(&admin->callback_sync)) { /* However, if we get a signal in a user-mode process, we could spin... */ fsleep(1000); } result = admin->completion.result; /* pairs with implicit barrier in cmpxchg above */ smp_wmb(); atomic_set(&admin->busy, 0); return result; } /* Assert that we are operating on the correct thread for the current phase. */ static void assert_admin_phase_thread(struct vdo *vdo, const char *what) { VDO_ASSERT_LOG_ONLY(vdo_get_callback_thread_id() == get_thread_id_for_phase(vdo), "%s on correct thread for %s", what, ADMIN_PHASE_NAMES[vdo->admin.phase]); } /** * finish_operation_callback() - Callback to finish an admin operation. * @completion: The admin_completion. 
*/ static void finish_operation_callback(struct vdo_completion *completion) { struct vdo_administrator *admin = &completion->vdo->admin; vdo_finish_operation(&admin->state, completion->result); complete(&admin->callback_sync); } /** * decode_from_super_block() - Decode the VDO state from the super block and validate that it is * correct. * @vdo: The vdo being loaded. * * On error from this method, the component states must be destroyed explicitly. If this method * returns successfully, the component states must not be destroyed. * * Return: VDO_SUCCESS or an error. */ static int __must_check decode_from_super_block(struct vdo *vdo) { const struct device_config *config = vdo->device_config; int result; result = vdo_decode_component_states(vdo->super_block.buffer, &vdo->geometry, &vdo->states); if (result != VDO_SUCCESS) return result; vdo_set_state(vdo, vdo->states.vdo.state); vdo->load_state = vdo->states.vdo.state; /* * If the device config specifies a larger logical size than was recorded in the super * block, just accept it. */ if (vdo->states.vdo.config.logical_blocks < config->logical_blocks) { vdo_log_warning("Growing logical size: a logical size of %llu blocks was specified, but that differs from the %llu blocks configured in the vdo super block", (unsigned long long) config->logical_blocks, (unsigned long long) vdo->states.vdo.config.logical_blocks); vdo->states.vdo.config.logical_blocks = config->logical_blocks; } result = vdo_validate_component_states(&vdo->states, vdo->geometry.nonce, config->physical_blocks, config->logical_blocks); if (result != VDO_SUCCESS) return result; vdo->layout = vdo->states.layout; return VDO_SUCCESS; } /** * decode_vdo() - Decode the component data portion of a super block and fill in the corresponding * portions of the vdo being loaded. * @vdo: The vdo being loaded. * * This will also allocate the recovery journal and slab depot. If this method is called with an * asynchronous layer (i.e. a thread config which specifies at least one base thread), the block * map and packer will be constructed as well. * * Return: VDO_SUCCESS or an error. 
*/ static int __must_check decode_vdo(struct vdo *vdo) { block_count_t maximum_age, journal_length; struct partition *partition; int result; result = decode_from_super_block(vdo); if (result != VDO_SUCCESS) { vdo_destroy_component_states(&vdo->states); return result; } maximum_age = vdo_convert_maximum_age(vdo->device_config->block_map_maximum_age); journal_length = vdo_get_recovery_journal_length(vdo->states.vdo.config.recovery_journal_size); if (maximum_age > (journal_length / 2)) { return vdo_log_error_strerror(VDO_BAD_CONFIGURATION, "maximum age: %llu exceeds limit %llu", (unsigned long long) maximum_age, (unsigned long long) (journal_length / 2)); } if (maximum_age == 0) { return vdo_log_error_strerror(VDO_BAD_CONFIGURATION, "maximum age must be greater than 0"); } result = vdo_enable_read_only_entry(vdo); if (result != VDO_SUCCESS) return result; partition = vdo_get_known_partition(&vdo->layout, VDO_RECOVERY_JOURNAL_PARTITION); result = vdo_decode_recovery_journal(vdo->states.recovery_journal, vdo->states.vdo.nonce, vdo, partition, vdo->states.vdo.complete_recoveries, vdo->states.vdo.config.recovery_journal_size, &vdo->recovery_journal); if (result != VDO_SUCCESS) return result; partition = vdo_get_known_partition(&vdo->layout, VDO_SLAB_SUMMARY_PARTITION); result = vdo_decode_slab_depot(vdo->states.slab_depot, vdo, partition, &vdo->depot); if (result != VDO_SUCCESS) return result; result = vdo_decode_block_map(vdo->states.block_map, vdo->states.vdo.config.logical_blocks, vdo, vdo->recovery_journal, vdo->states.vdo.nonce, vdo->device_config->cache_size, maximum_age, &vdo->block_map); if (result != VDO_SUCCESS) return result; result = vdo_make_physical_zones(vdo, &vdo->physical_zones); if (result != VDO_SUCCESS) return result; /* The logical zones depend on the physical zones already existing. */ result = vdo_make_logical_zones(vdo, &vdo->logical_zones); if (result != VDO_SUCCESS) return result; return vdo_make_hash_zones(vdo, &vdo->hash_zones); } /** * pre_load_callback() - Callback to initiate a pre-load, registered in vdo_initialize(). * @completion: The admin completion. 
*/ static void pre_load_callback(struct vdo_completion *completion) { struct vdo *vdo = completion->vdo; int result; assert_admin_phase_thread(vdo, __func__); switch (advance_phase(vdo)) { case PRE_LOAD_PHASE_START: result = vdo_start_operation(&vdo->admin.state, VDO_ADMIN_STATE_PRE_LOADING); if (result != VDO_SUCCESS) { vdo_continue_completion(completion, result); return; } vdo_load_super_block(vdo, completion); return; case PRE_LOAD_PHASE_LOAD_COMPONENTS: vdo_continue_completion(completion, decode_vdo(vdo)); return; case PRE_LOAD_PHASE_END: break; default: vdo_set_completion_result(completion, UDS_BAD_STATE); } finish_operation_callback(completion); } static void release_instance(unsigned int instance) { mutex_lock(&instances_lock); if (instance >= instances.bit_count) { VDO_ASSERT_LOG_ONLY(false, "instance number %u must be less than bit count %u", instance, instances.bit_count); } else if (test_bit(instance, instances.words) == 0) { VDO_ASSERT_LOG_ONLY(false, "instance number %u must be allocated", instance); } else { __clear_bit(instance, instances.words); instances.count -= 1; } mutex_unlock(&instances_lock); } static void set_device_config(struct dm_target *ti, struct vdo *vdo, struct device_config *config) { list_del_init(&config->config_list); list_add_tail(&config->config_list, &vdo->device_config_list); config->vdo = vdo; ti->private = config; configure_target_capabilities(ti); } static int vdo_initialize(struct dm_target *ti, unsigned int instance, struct device_config *config) { struct vdo *vdo; int result; u64 block_size = VDO_BLOCK_SIZE; u64 logical_size = to_bytes(ti->len); block_count_t logical_blocks = logical_size / block_size; vdo_log_info("loading device '%s'", vdo_get_device_name(ti)); vdo_log_debug("Logical block size = %llu", (u64) config->logical_block_size); vdo_log_debug("Logical blocks = %llu", logical_blocks); vdo_log_debug("Physical block size = %llu", (u64) block_size); vdo_log_debug("Physical blocks = %llu", config->physical_blocks); vdo_log_debug("Block map cache blocks = %u", config->cache_size); vdo_log_debug("Block map maximum age = %u", config->block_map_maximum_age); vdo_log_debug("Deduplication = %s", (config->deduplication ? "on" : "off")); vdo_log_debug("Compression = %s", (config->compression ? "on" : "off")); vdo = vdo_find_matching(vdo_uses_device, config); if (vdo != NULL) { vdo_log_error("Existing vdo already uses device %s", vdo->device_config->parent_device_name); ti->error = "Cannot share storage device with already-running VDO"; return VDO_BAD_CONFIGURATION; } result = vdo_make(instance, config, &ti->error, &vdo); if (result != VDO_SUCCESS) { vdo_log_error("Could not create VDO device. (VDO error %d, message %s)", result, ti->error); vdo_destroy(vdo); return result; } result = perform_admin_operation(vdo, PRE_LOAD_PHASE_START, pre_load_callback, finish_operation_callback, "pre-load"); if (result != VDO_SUCCESS) { ti->error = ((result == VDO_INVALID_ADMIN_STATE) ? "Pre-load is only valid immediately after initialization" : "Cannot load metadata from device"); vdo_log_error("Could not start VDO device. (VDO error %d, message %s)", result, ti->error); vdo_destroy(vdo); return result; } set_device_config(ti, vdo, config); vdo->device_config = config; return VDO_SUCCESS; } /* Implements vdo_filter_fn. 
*/ static bool __must_check vdo_is_named(struct vdo *vdo, const void *context) { struct dm_target *ti = vdo->device_config->owning_target; const char *device_name = vdo_get_device_name(ti); return strcmp(device_name, context) == 0; } /** * get_bit_array_size() - Return the number of bytes needed to store a bit array of the specified * capacity in an array of unsigned longs. * @bit_count: The number of bits the array must hold. * * Return: the number of bytes needed for the array representation. */ static size_t get_bit_array_size(unsigned int bit_count) { /* Round up to a multiple of the word size and convert to a byte count. */ return (BITS_TO_LONGS(bit_count) * sizeof(unsigned long)); } /** * grow_bit_array() - Re-allocate the bitmap word array so there will more instance numbers that * can be allocated. * * Since the array is initially NULL, this also initializes the array the first time we allocate an * instance number. * * Return: VDO_SUCCESS or an error code from the allocation */ static int grow_bit_array(void) { unsigned int new_count = max(instances.bit_count + BIT_COUNT_INCREMENT, (unsigned int) BIT_COUNT_MINIMUM); unsigned long *new_words; int result; result = vdo_reallocate_memory(instances.words, get_bit_array_size(instances.bit_count), get_bit_array_size(new_count), "instance number bit array", &new_words); if (result != VDO_SUCCESS) return result; instances.bit_count = new_count; instances.words = new_words; return VDO_SUCCESS; } /** * allocate_instance() - Allocate an instance number. * @instance_ptr: A point to hold the instance number * * Return: VDO_SUCCESS or an error code * * This function must be called while holding the instances lock. */ static int allocate_instance(unsigned int *instance_ptr) { unsigned int instance; int result; /* If there are no unallocated instances, grow the bit array. */ if (instances.count >= instances.bit_count) { result = grow_bit_array(); if (result != VDO_SUCCESS) return result; } /* * There must be a zero bit somewhere now. Find it, starting just after the last instance * allocated. */ instance = find_next_zero_bit(instances.words, instances.bit_count, instances.next); if (instance >= instances.bit_count) { /* Nothing free after next, so wrap around to instance zero. 
*/ instance = find_first_zero_bit(instances.words, instances.bit_count); result = VDO_ASSERT(instance < instances.bit_count, "impossibly, no zero bit found"); if (result != VDO_SUCCESS) return result; } __set_bit(instance, instances.words); instances.count++; instances.next = instance + 1; *instance_ptr = instance; return VDO_SUCCESS; } static int construct_new_vdo_registered(struct dm_target *ti, unsigned int argc, char **argv, unsigned int instance) { int result; struct device_config *config; result = parse_device_config(argc, argv, ti, &config); if (result != VDO_SUCCESS) { vdo_log_error_strerror(result, "parsing failed: %s", ti->error); release_instance(instance); return -EINVAL; } /* Beyond this point, the instance number will be cleaned up for us if needed */ result = vdo_initialize(ti, instance, config); if (result != VDO_SUCCESS) { release_instance(instance); free_device_config(config); return vdo_status_to_errno(result); } return VDO_SUCCESS; } static int construct_new_vdo(struct dm_target *ti, unsigned int argc, char **argv) { int result; unsigned int instance; struct registered_thread instance_thread; mutex_lock(&instances_lock); result = allocate_instance(&instance); mutex_unlock(&instances_lock); if (result != VDO_SUCCESS) return -ENOMEM; vdo_register_thread_device_id(&instance_thread, &instance); result = construct_new_vdo_registered(ti, argc, argv, instance); vdo_unregister_thread_device_id(); return result; } /** * check_may_grow_physical() - Callback to check that we're not in recovery mode, used in * vdo_prepare_to_grow_physical(). * @completion: The admin completion. */ static void check_may_grow_physical(struct vdo_completion *completion) { struct vdo *vdo = completion->vdo; assert_admin_phase_thread(vdo, __func__); /* These checks can only be done from a vdo thread. */ if (vdo_is_read_only(vdo)) vdo_set_completion_result(completion, VDO_READ_ONLY); if (vdo_in_recovery_mode(vdo)) vdo_set_completion_result(completion, VDO_RETRY_AFTER_REBUILD); finish_operation_callback(completion); } static block_count_t get_partition_size(struct layout *layout, enum partition_id id) { return vdo_get_known_partition(layout, id)->count; } /** * grow_layout() - Make the layout for growing a vdo. * @vdo: The vdo preparing to grow. * @old_size: The current size of the vdo. * @new_size: The size to which the vdo will be grown. * * Return: VDO_SUCCESS or an error code. */ static int grow_layout(struct vdo *vdo, block_count_t old_size, block_count_t new_size) { int result; block_count_t min_new_size; if (vdo->next_layout.size == new_size) { /* We are already prepared to grow to the new size, so we're done. */ return VDO_SUCCESS; } /* Make a copy completion if there isn't one */ if (vdo->partition_copier == NULL) { vdo->partition_copier = dm_kcopyd_client_create(NULL); if (IS_ERR(vdo->partition_copier)) { result = PTR_ERR(vdo->partition_copier); vdo->partition_copier = NULL; return result; } } /* Free any unused preparation. */ vdo_uninitialize_layout(&vdo->next_layout); /* * Make a new layout with the existing partition sizes for everything but the slab depot * partition. 
*/ result = vdo_initialize_layout(new_size, vdo->layout.start, get_partition_size(&vdo->layout, VDO_BLOCK_MAP_PARTITION), get_partition_size(&vdo->layout, VDO_RECOVERY_JOURNAL_PARTITION), get_partition_size(&vdo->layout, VDO_SLAB_SUMMARY_PARTITION), &vdo->next_layout); if (result != VDO_SUCCESS) { dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier)); return result; } /* Ensure the new journal and summary are entirely within the added blocks. */ min_new_size = (old_size + get_partition_size(&vdo->next_layout, VDO_SLAB_SUMMARY_PARTITION) + get_partition_size(&vdo->next_layout, VDO_RECOVERY_JOURNAL_PARTITION)); if (min_new_size > new_size) { /* Copying the journal and summary would destroy some old metadata. */ vdo_uninitialize_layout(&vdo->next_layout); dm_kcopyd_client_destroy(vdo_forget(vdo->partition_copier)); return VDO_INCREMENT_TOO_SMALL; } return VDO_SUCCESS; } static int prepare_to_grow_physical(struct vdo *vdo, block_count_t new_physical_blocks) { int result; block_count_t current_physical_blocks = vdo->states.vdo.config.physical_blocks; vdo_log_info("Preparing to resize physical to %llu", (unsigned long long) new_physical_blocks); VDO_ASSERT_LOG_ONLY((new_physical_blocks > current_physical_blocks), "New physical size is larger than current physical size"); result = perform_admin_operation(vdo, PREPARE_GROW_PHYSICAL_PHASE_START, check_may_grow_physical, finish_operation_callback, "prepare grow-physical"); if (result != VDO_SUCCESS) return result; result = grow_layout(vdo, current_physical_blocks, new_physical_blocks); if (result != VDO_SUCCESS) return result; result = vdo_prepare_to_grow_slab_depot(vdo->depot, vdo_get_known_partition(&vdo->next_layout, VDO_SLAB_DEPOT_PARTITION)); if (result != VDO_SUCCESS) { vdo_uninitialize_layout(&vdo->next_layout); return result; } vdo_log_info("Done preparing to resize physical"); return VDO_SUCCESS; } /** * validate_new_device_config() - Check whether a new device config represents a valid modification * to an existing config. * @to_validate: The new config to validate. * @config: The existing config. * @may_grow: Set to true if growing the logical and physical size of the vdo is currently * permitted. * @error_ptr: A pointer to hold the reason for any error. * * Return: VDO_SUCCESS or an error. 
*/ static int validate_new_device_config(struct device_config *to_validate, struct device_config *config, bool may_grow, char **error_ptr) { if (to_validate->owning_target->begin != config->owning_target->begin) { *error_ptr = "Starting sector cannot change"; return VDO_PARAMETER_MISMATCH; } if (to_validate->logical_block_size != config->logical_block_size) { *error_ptr = "Logical block size cannot change"; return VDO_PARAMETER_MISMATCH; } if (to_validate->logical_blocks < config->logical_blocks) { *error_ptr = "Can't shrink VDO logical size"; return VDO_PARAMETER_MISMATCH; } if (to_validate->cache_size != config->cache_size) { *error_ptr = "Block map cache size cannot change"; return VDO_PARAMETER_MISMATCH; } if (to_validate->block_map_maximum_age != config->block_map_maximum_age) { *error_ptr = "Block map maximum age cannot change"; return VDO_PARAMETER_MISMATCH; } if (memcmp(&to_validate->thread_counts, &config->thread_counts, sizeof(struct thread_count_config)) != 0) { *error_ptr = "Thread configuration cannot change"; return VDO_PARAMETER_MISMATCH; } if (to_validate->physical_blocks < config->physical_blocks) { *error_ptr = "Removing physical storage from a VDO is not supported"; return VDO_NOT_IMPLEMENTED; } if (!may_grow && (to_validate->physical_blocks > config->physical_blocks)) { *error_ptr = "VDO physical size may not grow in current state"; return VDO_NOT_IMPLEMENTED; } return VDO_SUCCESS; } static int prepare_to_modify(struct dm_target *ti, struct device_config *config, struct vdo *vdo) { int result; bool may_grow = (vdo_get_admin_state(vdo) != VDO_ADMIN_STATE_PRE_LOADED); result = validate_new_device_config(config, vdo->device_config, may_grow, &ti->error); if (result != VDO_SUCCESS) return -EINVAL; if (config->logical_blocks > vdo->device_config->logical_blocks) { block_count_t logical_blocks = vdo->states.vdo.config.logical_blocks; vdo_log_info("Preparing to resize logical to %llu", (unsigned long long) config->logical_blocks); VDO_ASSERT_LOG_ONLY((config->logical_blocks > logical_blocks), "New logical size is larger than current size"); result = vdo_prepare_to_grow_block_map(vdo->block_map, config->logical_blocks); if (result != VDO_SUCCESS) { ti->error = "Device vdo_prepare_to_grow_logical failed"; return result; } vdo_log_info("Done preparing to resize logical"); } if (config->physical_blocks > vdo->device_config->physical_blocks) { result = prepare_to_grow_physical(vdo, config->physical_blocks); if (result != VDO_SUCCESS) { if (result == VDO_PARAMETER_MISMATCH) { /* * If we don't trap this case, vdo_status_to_errno() will remap * it to -EIO, which is misleading and ahistorical. 
*/ result = -EINVAL; } if (result == VDO_TOO_MANY_SLABS) ti->error = "Device vdo_prepare_to_grow_physical failed (specified physical size too big based on formatted slab size)"; else ti->error = "Device vdo_prepare_to_grow_physical failed"; return result; } } if (strcmp(config->parent_device_name, vdo->device_config->parent_device_name) != 0) { const char *device_name = vdo_get_device_name(config->owning_target); vdo_log_info("Updating backing device of %s from %s to %s", device_name, vdo->device_config->parent_device_name, config->parent_device_name); } return VDO_SUCCESS; } static int update_existing_vdo(const char *device_name, struct dm_target *ti, unsigned int argc, char **argv, struct vdo *vdo) { int result; struct device_config *config; result = parse_device_config(argc, argv, ti, &config); if (result != VDO_SUCCESS) return -EINVAL; vdo_log_info("preparing to modify device '%s'", device_name); result = prepare_to_modify(ti, config, vdo); if (result != VDO_SUCCESS) { free_device_config(config); return vdo_status_to_errno(result); } set_device_config(ti, vdo, config); return VDO_SUCCESS; } static int vdo_ctr(struct dm_target *ti, unsigned int argc, char **argv) { int result; struct registered_thread allocating_thread, instance_thread; const char *device_name; struct vdo *vdo; vdo_register_allocating_thread(&allocating_thread, NULL); device_name = vdo_get_device_name(ti); vdo = vdo_find_matching(vdo_is_named, device_name); if (vdo == NULL) { result = construct_new_vdo(ti, argc, argv); } else { vdo_register_thread_device_id(&instance_thread, &vdo->instance); result = update_existing_vdo(device_name, ti, argc, argv, vdo); vdo_unregister_thread_device_id(); } vdo_unregister_allocating_thread(); return result; } static void vdo_dtr(struct dm_target *ti) { struct device_config *config = ti->private; struct vdo *vdo = vdo_forget(config->vdo); list_del_init(&config->config_list); if (list_empty(&vdo->device_config_list)) { const char *device_name; /* This was the last config referencing the VDO. Free it. */ unsigned int instance = vdo->instance; struct registered_thread allocating_thread, instance_thread; vdo_register_thread_device_id(&instance_thread, &instance); vdo_register_allocating_thread(&allocating_thread, NULL); device_name = vdo_get_device_name(ti); vdo_log_info("stopping device '%s'", device_name); if (vdo->dump_on_shutdown) vdo_dump_all(vdo, "device shutdown"); vdo_destroy(vdo_forget(vdo)); vdo_log_info("device '%s' stopped", device_name); vdo_unregister_thread_device_id(); vdo_unregister_allocating_thread(); release_instance(instance); } else if (config == vdo->device_config) { /* * The VDO still references this config. Give it a reference to a config that isn't * being destroyed. */ vdo->device_config = list_first_entry(&vdo->device_config_list, struct device_config, config_list); } free_device_config(config); ti->private = NULL; } static void vdo_presuspend(struct dm_target *ti) { get_vdo_for_target(ti)->suspend_type = (dm_noflush_suspending(ti) ? VDO_ADMIN_STATE_SUSPENDING : VDO_ADMIN_STATE_SAVING); } /** * write_super_block_for_suspend() - Update the VDO state and save the super block. 
* @completion: The admin completion */ static void write_super_block_for_suspend(struct vdo_completion *completion) { struct vdo *vdo = completion->vdo; switch (vdo_get_state(vdo)) { case VDO_DIRTY: case VDO_NEW: vdo_set_state(vdo, VDO_CLEAN); break; case VDO_CLEAN: case VDO_READ_ONLY_MODE: case VDO_FORCE_REBUILD: case VDO_RECOVERING: case VDO_REBUILD_FOR_UPGRADE: break; case VDO_REPLAYING: default: vdo_continue_completion(completion, UDS_BAD_STATE); return; } vdo_save_components(vdo, completion); } /** * suspend_callback() - Callback to initiate a suspend, registered in vdo_postsuspend(). * @completion: The sub-task completion. */ static void suspend_callback(struct vdo_completion *completion) { struct vdo *vdo = completion->vdo; struct admin_state *state = &vdo->admin.state; int result; assert_admin_phase_thread(vdo, __func__); switch (advance_phase(vdo)) { case SUSPEND_PHASE_START: if (vdo_get_admin_state_code(state)->quiescent) { /* Already suspended */ break; } vdo_continue_completion(completion, vdo_start_operation(state, vdo->suspend_type)); return; case SUSPEND_PHASE_PACKER: /* * If the VDO was already resumed from a prior suspend while read-only, some of the * components may not have been resumed. By setting a read-only error here, we * guarantee that the result of this suspend will be VDO_READ_ONLY and not * VDO_INVALID_ADMIN_STATE in that case. */ if (vdo_in_read_only_mode(vdo)) vdo_set_completion_result(completion, VDO_READ_ONLY); vdo_drain_packer(vdo->packer, completion); return; case SUSPEND_PHASE_DATA_VIOS: drain_data_vio_pool(vdo->data_vio_pool, completion); return; case SUSPEND_PHASE_DEDUPE: vdo_drain_hash_zones(vdo->hash_zones, completion); return; case SUSPEND_PHASE_FLUSHES: vdo_drain_flusher(vdo->flusher, completion); return; case SUSPEND_PHASE_LOGICAL_ZONES: /* * Attempt to flush all I/O before completing post suspend work. We believe a * suspended device is expected to have persisted all data written before the * suspend, even if it hasn't been flushed yet. */ result = vdo_synchronous_flush(vdo); if (result != VDO_SUCCESS) vdo_enter_read_only_mode(vdo, result); vdo_drain_logical_zones(vdo->logical_zones, vdo_get_admin_state_code(state), completion); return; case SUSPEND_PHASE_BLOCK_MAP: vdo_drain_block_map(vdo->block_map, vdo_get_admin_state_code(state), completion); return; case SUSPEND_PHASE_JOURNAL: vdo_drain_recovery_journal(vdo->recovery_journal, vdo_get_admin_state_code(state), completion); return; case SUSPEND_PHASE_DEPOT: vdo_drain_slab_depot(vdo->depot, vdo_get_admin_state_code(state), completion); return; case SUSPEND_PHASE_READ_ONLY_WAIT: vdo_wait_until_not_entering_read_only_mode(completion); return; case SUSPEND_PHASE_WRITE_SUPER_BLOCK: if (vdo_is_state_suspending(state) || (completion->result != VDO_SUCCESS)) { /* If we didn't save the VDO or there was an error, we're done. */ break; } write_super_block_for_suspend(completion); return; case SUSPEND_PHASE_END: break; default: vdo_set_completion_result(completion, UDS_BAD_STATE); } finish_operation_callback(completion); } static void vdo_postsuspend(struct dm_target *ti) { struct vdo *vdo = get_vdo_for_target(ti); struct registered_thread instance_thread; const char *device_name; int result; vdo_register_thread_device_id(&instance_thread, &vdo->instance); device_name = vdo_get_device_name(vdo->device_config->owning_target); vdo_log_info("suspending device '%s'", device_name); /* * It's important to note any error here does not actually stop device-mapper from * suspending the device. 
All this work is done post suspend. */ result = perform_admin_operation(vdo, SUSPEND_PHASE_START, suspend_callback, suspend_callback, "suspend"); if ((result == VDO_SUCCESS) || (result == VDO_READ_ONLY)) { /* * Treat VDO_READ_ONLY as a success since a read-only suspension still leaves the * VDO suspended. */ vdo_log_info("device '%s' suspended", device_name); } else if (result == VDO_INVALID_ADMIN_STATE) { vdo_log_error("Suspend invoked while in unexpected state: %s", vdo_get_admin_state(vdo)->name); } else { vdo_log_error_strerror(result, "Suspend of device '%s' failed", device_name); } vdo_unregister_thread_device_id(); } /** * was_new() - Check whether the vdo was new when it was loaded. * @vdo: The vdo to query. * * Return: true if the vdo was new. */ static bool was_new(const struct vdo *vdo) { return (vdo->load_state == VDO_NEW); } /** * requires_repair() - Check whether a vdo requires recovery or rebuild. * @vdo: The vdo to query. * * Return: true if the vdo must be repaired. */ static bool __must_check requires_repair(const struct vdo *vdo) { switch (vdo_get_state(vdo)) { case VDO_DIRTY: case VDO_FORCE_REBUILD: case VDO_REPLAYING: case VDO_REBUILD_FOR_UPGRADE: return true; default: return false; } } /** * get_load_type() - Determine how the slab depot was loaded. * @vdo: The vdo. * * Return: How the depot was loaded. */ static enum slab_depot_load_type get_load_type(struct vdo *vdo) { if (vdo_state_requires_read_only_rebuild(vdo->load_state)) return VDO_SLAB_DEPOT_REBUILD_LOAD; if (vdo_state_requires_recovery(vdo->load_state)) return VDO_SLAB_DEPOT_RECOVERY_LOAD; return VDO_SLAB_DEPOT_NORMAL_LOAD; } /** * load_callback() - Callback to do the destructive parts of loading a VDO. * @completion: The sub-task completion. */ static void load_callback(struct vdo_completion *completion) { struct vdo *vdo = completion->vdo; int result; assert_admin_phase_thread(vdo, __func__); switch (advance_phase(vdo)) { case LOAD_PHASE_START: result = vdo_start_operation(&vdo->admin.state, VDO_ADMIN_STATE_LOADING); if (result != VDO_SUCCESS) { vdo_continue_completion(completion, result); return; } /* Prepare the recovery journal for new entries. */ vdo_open_recovery_journal(vdo->recovery_journal, vdo->depot, vdo->block_map); vdo_allow_read_only_mode_entry(completion); return; case LOAD_PHASE_LOAD_DEPOT: vdo_set_dedupe_state_normal(vdo->hash_zones); if (vdo_is_read_only(vdo)) { /* * In read-only mode we don't use the allocator and it may not even be * readable, so don't bother trying to load it. */ vdo_set_completion_result(completion, VDO_READ_ONLY); break; } if (requires_repair(vdo)) { vdo_repair(completion); return; } vdo_load_slab_depot(vdo->depot, (was_new(vdo) ? VDO_ADMIN_STATE_FORMATTING : VDO_ADMIN_STATE_LOADING), completion, NULL); return; case LOAD_PHASE_MAKE_DIRTY: vdo_set_state(vdo, VDO_DIRTY); vdo_save_components(vdo, completion); return; case LOAD_PHASE_PREPARE_TO_ALLOCATE: vdo_initialize_block_map_from_journal(vdo->block_map, vdo->recovery_journal); vdo_prepare_slab_depot_to_allocate(vdo->depot, get_load_type(vdo), completion); return; case LOAD_PHASE_SCRUB_SLABS: if (vdo_state_requires_recovery(vdo->load_state)) vdo_enter_recovery_mode(vdo); vdo_scrub_all_unrecovered_slabs(vdo->depot, completion); return; case LOAD_PHASE_DATA_REDUCTION: WRITE_ONCE(vdo->compressing, vdo->device_config->compression); if (vdo->device_config->deduplication) { /* * Don't try to load or rebuild the index first (and log scary error * messages) if this is known to be a newly-formatted volume. 
*/ vdo_start_dedupe_index(vdo->hash_zones, was_new(vdo)); } vdo->allocations_allowed = false; fallthrough; case LOAD_PHASE_FINISHED: break; case LOAD_PHASE_DRAIN_JOURNAL: vdo_drain_recovery_journal(vdo->recovery_journal, VDO_ADMIN_STATE_SAVING, completion); return; case LOAD_PHASE_WAIT_FOR_READ_ONLY: /* Avoid an infinite loop */ completion->error_handler = NULL; vdo->admin.phase = LOAD_PHASE_FINISHED; vdo_wait_until_not_entering_read_only_mode(completion); return; default: vdo_set_completion_result(completion, UDS_BAD_STATE); } finish_operation_callback(completion); } /** * handle_load_error() - Handle an error during the load operation. * @completion: The admin completion. * * If at all possible, brings the vdo online in read-only mode. This handler is registered in * vdo_preresume_registered(). */ static void handle_load_error(struct vdo_completion *completion) { struct vdo *vdo = completion->vdo; if (vdo_requeue_completion_if_needed(completion, vdo->thread_config.admin_thread)) return; if (vdo_state_requires_read_only_rebuild(vdo->load_state) && (vdo->admin.phase == LOAD_PHASE_MAKE_DIRTY)) { vdo_log_error_strerror(completion->result, "aborting load"); vdo->admin.phase = LOAD_PHASE_DRAIN_JOURNAL; load_callback(vdo_forget(completion)); return; } if ((completion->result == VDO_UNSUPPORTED_VERSION) && (vdo->admin.phase == LOAD_PHASE_MAKE_DIRTY)) { vdo_log_error("Aborting load due to unsupported version"); vdo->admin.phase = LOAD_PHASE_FINISHED; load_callback(completion); return; } vdo_log_error_strerror(completion->result, "Entering read-only mode due to load error"); vdo->admin.phase = LOAD_PHASE_WAIT_FOR_READ_ONLY; vdo_enter_read_only_mode(vdo, completion->result); completion->result = VDO_READ_ONLY; load_callback(completion); } /** * write_super_block_for_resume() - Update the VDO state and save the super block. * @completion: The admin completion */ static void write_super_block_for_resume(struct vdo_completion *completion) { struct vdo *vdo = completion->vdo; switch (vdo_get_state(vdo)) { case VDO_CLEAN: case VDO_NEW: vdo_set_state(vdo, VDO_DIRTY); vdo_save_components(vdo, completion); return; case VDO_DIRTY: case VDO_READ_ONLY_MODE: case VDO_FORCE_REBUILD: case VDO_RECOVERING: case VDO_REBUILD_FOR_UPGRADE: /* No need to write the super block in these cases */ vdo_launch_completion(completion); return; case VDO_REPLAYING: default: vdo_continue_completion(completion, UDS_BAD_STATE); } } /** * resume_callback() - Callback to resume a VDO. * @completion: The admin completion. 
*/
static void resume_callback(struct vdo_completion *completion)
{
	struct vdo *vdo = completion->vdo;
	int result;

	assert_admin_phase_thread(vdo, __func__);

	switch (advance_phase(vdo)) {
	case RESUME_PHASE_START:
		result = vdo_start_operation(&vdo->admin.state,
					     VDO_ADMIN_STATE_RESUMING);
		if (result != VDO_SUCCESS) {
			vdo_continue_completion(completion, result);
			return;
		}

		write_super_block_for_resume(completion);
		return;

	case RESUME_PHASE_ALLOW_READ_ONLY_MODE:
		vdo_allow_read_only_mode_entry(completion);
		return;

	case RESUME_PHASE_DEDUPE:
		vdo_resume_hash_zones(vdo->hash_zones, completion);
		return;

	case RESUME_PHASE_DEPOT:
		vdo_resume_slab_depot(vdo->depot, completion);
		return;

	case RESUME_PHASE_JOURNAL:
		vdo_resume_recovery_journal(vdo->recovery_journal, completion);
		return;

	case RESUME_PHASE_BLOCK_MAP:
		vdo_resume_block_map(vdo->block_map, completion);
		return;

	case RESUME_PHASE_LOGICAL_ZONES:
		vdo_resume_logical_zones(vdo->logical_zones, completion);
		return;

	case RESUME_PHASE_PACKER: {
		bool was_enabled = vdo_get_compressing(vdo);
		bool enable = vdo->device_config->compression;

		if (enable != was_enabled)
			WRITE_ONCE(vdo->compressing, enable);
		vdo_log_info("compression is %s", (enable ? "enabled" : "disabled"));
		vdo_resume_packer(vdo->packer, completion);
		return;
	}

	case RESUME_PHASE_FLUSHER:
		vdo_resume_flusher(vdo->flusher, completion);
		return;

	case RESUME_PHASE_DATA_VIOS:
		resume_data_vio_pool(vdo->data_vio_pool, completion);
		return;

	case RESUME_PHASE_END:
		break;

	default:
		vdo_set_completion_result(completion, UDS_BAD_STATE);
	}

	finish_operation_callback(completion);
}

/**
 * grow_logical_callback() - Callback to initiate a grow logical.
 * @completion: The admin completion.
 *
 * Registered in perform_grow_logical().
 */
static void grow_logical_callback(struct vdo_completion *completion)
{
	struct vdo *vdo = completion->vdo;
	int result;

	assert_admin_phase_thread(vdo, __func__);

	switch (advance_phase(vdo)) {
	case GROW_LOGICAL_PHASE_START:
		if (vdo_is_read_only(vdo)) {
			vdo_log_error_strerror(VDO_READ_ONLY,
					       "Can't grow logical size of a read-only VDO");
			vdo_set_completion_result(completion, VDO_READ_ONLY);
			break;
		}

		result = vdo_start_operation(&vdo->admin.state,
					     VDO_ADMIN_STATE_SUSPENDED_OPERATION);
		if (result != VDO_SUCCESS) {
			vdo_continue_completion(completion, result);
			return;
		}

		vdo->states.vdo.config.logical_blocks = vdo->block_map->next_entry_count;
		vdo_save_components(vdo, completion);
		return;

	case GROW_LOGICAL_PHASE_GROW_BLOCK_MAP:
		vdo_grow_block_map(vdo->block_map, completion);
		return;

	case GROW_LOGICAL_PHASE_END:
		break;

	case GROW_LOGICAL_PHASE_ERROR:
		vdo_enter_read_only_mode(vdo, completion->result);
		break;

	default:
		vdo_set_completion_result(completion, UDS_BAD_STATE);
	}

	finish_operation_callback(completion);
}

/**
 * handle_logical_growth_error() - Handle an error during the grow logical process.
 * @completion: The admin completion.
 */
static void handle_logical_growth_error(struct vdo_completion *completion)
{
	struct vdo *vdo = completion->vdo;

	if (vdo->admin.phase == GROW_LOGICAL_PHASE_GROW_BLOCK_MAP) {
		/*
		 * We've failed to write the new size in the super block, so set our in-memory
		 * config back to the old size.
		 */
		vdo->states.vdo.config.logical_blocks = vdo->block_map->entry_count;
		vdo_abandon_block_map_growth(vdo->block_map);
	}

	vdo->admin.phase = GROW_LOGICAL_PHASE_ERROR;
	grow_logical_callback(completion);
}

/**
 * perform_grow_logical() - Grow the logical size of the vdo.
 * @vdo: The vdo to grow.
 * @new_logical_blocks: The size to which the vdo should be grown.
* * Context: This method may only be called when the vdo has been suspended and must not be called * from a base thread. * * Return: VDO_SUCCESS or an error. */ static int perform_grow_logical(struct vdo *vdo, block_count_t new_logical_blocks) { int result; if (vdo->device_config->logical_blocks == new_logical_blocks) { /* * A table was loaded for which we prepared to grow, but a table without that * growth was what we are resuming with. */ vdo_abandon_block_map_growth(vdo->block_map); return VDO_SUCCESS; } vdo_log_info("Resizing logical to %llu", (unsigned long long) new_logical_blocks); if (vdo->block_map->next_entry_count != new_logical_blocks) return VDO_PARAMETER_MISMATCH; result = perform_admin_operation(vdo, GROW_LOGICAL_PHASE_START, grow_logical_callback, handle_logical_growth_error, "grow logical"); if (result != VDO_SUCCESS) return result; vdo_log_info("Logical blocks now %llu", (unsigned long long) new_logical_blocks); return VDO_SUCCESS; } static void copy_callback(int read_err, unsigned long write_err, void *context) { struct vdo_completion *completion = context; int result = (((read_err == 0) && (write_err == 0)) ? VDO_SUCCESS : -EIO); vdo_continue_completion(completion, result); } static void partition_to_region(struct partition *partition, struct vdo *vdo, struct dm_io_region *region) { physical_block_number_t pbn = partition->offset - vdo->geometry.bio_offset; *region = (struct dm_io_region) { .bdev = vdo_get_backing_device(vdo), .sector = pbn * VDO_SECTORS_PER_BLOCK, .count = partition->count * VDO_SECTORS_PER_BLOCK, }; } /** * copy_partition() - Copy a partition from the location specified in the current layout to that in * the next layout. * @vdo: The vdo preparing to grow. * @id: The ID of the partition to copy. * @parent: The completion to notify when the copy is complete. */ static void copy_partition(struct vdo *vdo, enum partition_id id, struct vdo_completion *parent) { struct dm_io_region read_region, write_regions[1]; struct partition *from = vdo_get_known_partition(&vdo->layout, id); struct partition *to = vdo_get_known_partition(&vdo->next_layout, id); partition_to_region(from, vdo, &read_region); partition_to_region(to, vdo, &write_regions[0]); dm_kcopyd_copy(vdo->partition_copier, &read_region, 1, write_regions, 0, copy_callback, parent); } /** * grow_physical_callback() - Callback to initiate a grow physical. * @completion: The admin completion. * * Registered in perform_grow_physical(). */ static void grow_physical_callback(struct vdo_completion *completion) { struct vdo *vdo = completion->vdo; int result; assert_admin_phase_thread(vdo, __func__); switch (advance_phase(vdo)) { case GROW_PHYSICAL_PHASE_START: if (vdo_is_read_only(vdo)) { vdo_log_error_strerror(VDO_READ_ONLY, "Can't grow physical size of a read-only VDO"); vdo_set_completion_result(completion, VDO_READ_ONLY); break; } result = vdo_start_operation(&vdo->admin.state, VDO_ADMIN_STATE_SUSPENDED_OPERATION); if (result != VDO_SUCCESS) { vdo_continue_completion(completion, result); return; } /* Copy the journal into the new layout. 
*/ copy_partition(vdo, VDO_RECOVERY_JOURNAL_PARTITION, completion); return; case GROW_PHYSICAL_PHASE_COPY_SUMMARY: copy_partition(vdo, VDO_SLAB_SUMMARY_PARTITION, completion); return; case GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS: vdo_uninitialize_layout(&vdo->layout); vdo->layout = vdo->next_layout; vdo_forget(vdo->next_layout.head); vdo->states.vdo.config.physical_blocks = vdo->layout.size; vdo_update_slab_depot_size(vdo->depot); vdo_save_components(vdo, completion); return; case GROW_PHYSICAL_PHASE_USE_NEW_SLABS: vdo_use_new_slabs(vdo->depot, completion); return; case GROW_PHYSICAL_PHASE_END: vdo->depot->summary_origin = vdo_get_known_partition(&vdo->layout, VDO_SLAB_SUMMARY_PARTITION)->offset; vdo->recovery_journal->origin = vdo_get_known_partition(&vdo->layout, VDO_RECOVERY_JOURNAL_PARTITION)->offset; break; case GROW_PHYSICAL_PHASE_ERROR: vdo_enter_read_only_mode(vdo, completion->result); break; default: vdo_set_completion_result(completion, UDS_BAD_STATE); } vdo_uninitialize_layout(&vdo->next_layout); finish_operation_callback(completion); } /** * handle_physical_growth_error() - Handle an error during the grow physical process. * @completion: The sub-task completion. */ static void handle_physical_growth_error(struct vdo_completion *completion) { completion->vdo->admin.phase = GROW_PHYSICAL_PHASE_ERROR; grow_physical_callback(completion); } /** * perform_grow_physical() - Grow the physical size of the vdo. * @vdo: The vdo to resize. * @new_physical_blocks: The new physical size in blocks. * * Context: This method may only be called when the vdo has been suspended and must not be called * from a base thread. * * Return: VDO_SUCCESS or an error. */ static int perform_grow_physical(struct vdo *vdo, block_count_t new_physical_blocks) { int result; block_count_t new_depot_size, prepared_depot_size; block_count_t old_physical_blocks = vdo->states.vdo.config.physical_blocks; /* Skip any noop grows. */ if (old_physical_blocks == new_physical_blocks) return VDO_SUCCESS; if (new_physical_blocks != vdo->next_layout.size) { /* * Either the VDO isn't prepared to grow, or it was prepared to grow to a different * size. Doing this check here relies on the fact that the call to this method is * done under the dmsetup message lock. */ vdo_uninitialize_layout(&vdo->next_layout); vdo_abandon_new_slabs(vdo->depot); return VDO_PARAMETER_MISMATCH; } /* Validate that we are prepared to grow appropriately. */ new_depot_size = vdo_get_known_partition(&vdo->next_layout, VDO_SLAB_DEPOT_PARTITION)->count; prepared_depot_size = (vdo->depot->new_slabs == NULL) ? 0 : vdo->depot->new_size; if (prepared_depot_size != new_depot_size) return VDO_PARAMETER_MISMATCH; result = perform_admin_operation(vdo, GROW_PHYSICAL_PHASE_START, grow_physical_callback, handle_physical_growth_error, "grow physical"); if (result != VDO_SUCCESS) return result; vdo_log_info("Physical block count was %llu, now %llu", (unsigned long long) old_physical_blocks, (unsigned long long) new_physical_blocks); return VDO_SUCCESS; } /** * apply_new_vdo_configuration() - Attempt to make any configuration changes from the table being * resumed. * @vdo: The vdo being resumed. * @config: The new device configuration derived from the table with which the vdo is being * resumed. * * Return: VDO_SUCCESS or an error. 
*/ static int __must_check apply_new_vdo_configuration(struct vdo *vdo, struct device_config *config) { int result; result = perform_grow_logical(vdo, config->logical_blocks); if (result != VDO_SUCCESS) { vdo_log_error("grow logical operation failed, result = %d", result); return result; } result = perform_grow_physical(vdo, config->physical_blocks); if (result != VDO_SUCCESS) vdo_log_error("resize operation failed, result = %d", result); return result; } static int vdo_preresume_registered(struct dm_target *ti, struct vdo *vdo) { struct device_config *config = ti->private; const char *device_name = vdo_get_device_name(ti); block_count_t backing_blocks; int result; backing_blocks = get_underlying_device_block_count(vdo); if (backing_blocks < config->physical_blocks) { /* FIXME: can this still happen? */ vdo_log_error("resume of device '%s' failed: backing device has %llu blocks but VDO physical size is %llu blocks", device_name, (unsigned long long) backing_blocks, (unsigned long long) config->physical_blocks); return -EINVAL; } if (vdo_get_admin_state(vdo) == VDO_ADMIN_STATE_PRE_LOADED) { vdo_log_info("starting device '%s'", device_name); result = perform_admin_operation(vdo, LOAD_PHASE_START, load_callback, handle_load_error, "load"); if (result == VDO_UNSUPPORTED_VERSION) { /* * A component version is not supported. This can happen when the * recovery journal metadata is in an old version format. Abort the * load without saving the state. */ vdo->suspend_type = VDO_ADMIN_STATE_SUSPENDING; perform_admin_operation(vdo, SUSPEND_PHASE_START, suspend_callback, suspend_callback, "suspend"); return result; } if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { /* * Something has gone very wrong. Make sure everything has drained and * leave the device in an unresumable state. */ vdo_log_error_strerror(result, "Start failed, could not load VDO metadata"); vdo->suspend_type = VDO_ADMIN_STATE_STOPPING; perform_admin_operation(vdo, SUSPEND_PHASE_START, suspend_callback, suspend_callback, "suspend"); return result; } /* Even if the VDO is read-only, it is now able to handle read requests. */ vdo_log_info("device '%s' started", device_name); } vdo_log_info("resuming device '%s'", device_name); /* If this fails, the VDO was not in a state to be resumed. This should never happen. */ result = apply_new_vdo_configuration(vdo, config); BUG_ON(result == VDO_INVALID_ADMIN_STATE); /* * Now that we've tried to modify the vdo, the new config *is* the config, whether the * modifications worked or not. */ vdo->device_config = config; /* * Any error here is highly unexpected and the state of the vdo is questionable, so we mark * it read-only in memory. Because we are suspended, the read-only state will not be * written to disk. */ if (result != VDO_SUCCESS) { vdo_log_error_strerror(result, "Commit of modifications to device '%s' failed", device_name); vdo_enter_read_only_mode(vdo, result); return result; } if (vdo_get_admin_state(vdo)->normal) { /* The VDO was just started, so we don't need to resume it. */ return VDO_SUCCESS; } result = perform_admin_operation(vdo, RESUME_PHASE_START, resume_callback, resume_callback, "resume"); BUG_ON(result == VDO_INVALID_ADMIN_STATE); if (result == VDO_READ_ONLY) { /* Even if the vdo is read-only, it has still resumed. 
*/ result = VDO_SUCCESS; } if (result != VDO_SUCCESS) vdo_log_error("resume of device '%s' failed with error: %d", device_name, result); return result; } static int vdo_preresume(struct dm_target *ti) { struct registered_thread instance_thread; struct vdo *vdo = get_vdo_for_target(ti); int result; vdo_register_thread_device_id(&instance_thread, &vdo->instance); result = vdo_preresume_registered(ti, vdo); if ((result == VDO_PARAMETER_MISMATCH) || (result == VDO_INVALID_ADMIN_STATE) || (result == VDO_UNSUPPORTED_VERSION)) result = -EINVAL; vdo_unregister_thread_device_id(); return vdo_status_to_errno(result); } static void vdo_resume(struct dm_target *ti) { struct registered_thread instance_thread; vdo_register_thread_device_id(&instance_thread, &get_vdo_for_target(ti)->instance); vdo_log_info("device '%s' resumed", vdo_get_device_name(ti)); vdo_unregister_thread_device_id(); } /* * If anything changes that affects how user tools will interact with vdo, update the version * number and make sure documentation about the change is complete so tools can properly update * their management code. */ static struct target_type vdo_target_bio = { .features = DM_TARGET_SINGLETON, .name = "vdo", .version = { 9, 1, 0 }, .module = THIS_MODULE, .ctr = vdo_ctr, .dtr = vdo_dtr, .io_hints = vdo_io_hints, .iterate_devices = vdo_iterate_devices, .map = vdo_map_bio, .message = vdo_message, .status = vdo_status, .presuspend = vdo_presuspend, .postsuspend = vdo_postsuspend, .preresume = vdo_preresume, .resume = vdo_resume, }; static bool dm_registered; static void vdo_module_destroy(void) { vdo_log_debug("unloading"); if (dm_registered) dm_unregister_target(&vdo_target_bio); VDO_ASSERT_LOG_ONLY(instances.count == 0, "should have no instance numbers still in use, but have %u", instances.count); vdo_free(instances.words); memset(&instances, 0, sizeof(struct instance_tracker)); } static int __init vdo_init(void) { int result = 0; /* Memory tracking must be initialized first for accurate accounting. */ vdo_memory_init(); vdo_initialize_threads_mutex(); vdo_initialize_thread_device_registry(); vdo_initialize_device_registry_once(); /* Add VDO errors to the set of errors registered by the indexer. */ result = vdo_register_status_codes(); if (result != VDO_SUCCESS) { vdo_log_error("vdo_register_status_codes failed %d", result); vdo_module_destroy(); return result; } result = dm_register_target(&vdo_target_bio); if (result < 0) { vdo_log_error("dm_register_target failed %d", result); vdo_module_destroy(); return result; } dm_registered = true; return result; } static void __exit vdo_exit(void) { vdo_module_destroy(); /* Memory tracking cleanup must be done last. */ vdo_memory_exit(); } module_init(vdo_init); module_exit(vdo_exit); module_param_named(log_level, vdo_log_level, uint, 0644); MODULE_PARM_DESC(log_level, "Log level for log messages"); MODULE_DESCRIPTION(DM_NAME " target for transparent deduplication"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL");
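
/*
 * Illustrative usage note (not part of the driver): vdo_ctr() is invoked when a "vdo" target is
 * created from a device-mapper table line. For the current TABLE_VERSION, the required arguments
 * are the version tag, the backing device, the backing device size in 4096-byte blocks, the
 * minimum I/O size, the block map cache size in blocks, and the block map era length, optionally
 * followed by optional arguments. The values below are placeholders chosen only for the sake of
 * an example; see Documentation/admin-guide/device-mapper/vdo.rst for the authoritative table
 * format and the list of optional arguments.
 *
 *   dmsetup create vdo0 --table \
 *     "0 41943040 vdo V4 /dev/sdb1 10485760 4096 131072 16380"
 */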