From 93e72b3c612adcaca13d874fcc86c53e6c8da541 Mon Sep 17 00:00:00 2001 From: Philippe Liard Date: Mon, 1 Jun 2020 21:45:23 -0700 Subject: squashfs: migrate from ll_rw_block usage to BIO ll_rw_block() function has been deprecated in favor of BIO which appears to come with large performance improvements. This patch decreases boot time by close to 40% when using squashfs for the root file-system. This is observed at least in the context of starting an Android VM on Chrome OS using crosvm. The patch was tested on 4.19 as well as master. This patch is largely based on Adrien Schildknecht's patch that was originally sent as https://lkml.org/lkml/2017/9/22/814 though with some significant changes and simplifications while also taking Phillip Lougher's feedback into account, around preserving support for FILE_CACHE in particular. [akpm@linux-foundation.org: fix build error reported by Randy] Link: http://lkml.kernel.org/r/319997c2-5fc8-f889-2ea3-d913308a7c1f@infradead.org Signed-off-by: Philippe Liard Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Adrien Schildknecht Cc: Phillip Lougher Cc: Guenter Roeck Cc: Daniel Rosenberg Link: https://chromium.googlesource.com/chromiumos/platform/crosvm Link: http://lkml.kernel.org/r/20191106074238.186023-1-pliard@google.com Signed-off-by: Linus Torvalds --- fs/squashfs/block.c | 273 +++++++++++++++++--------------- fs/squashfs/decompressor.h | 5 +- fs/squashfs/decompressor_multi.c | 9 +- fs/squashfs/decompressor_multi_percpu.c | 17 +- fs/squashfs/decompressor_single.c | 9 +- fs/squashfs/lz4_wrapper.c | 17 +- fs/squashfs/lzo_wrapper.c | 17 +- fs/squashfs/squashfs.h | 4 +- fs/squashfs/xz_wrapper.c | 51 +++--- fs/squashfs/zlib_wrapper.c | 63 ++++---- fs/squashfs/zstd_wrapper.c | 64 ++++---- 11 files changed, 287 insertions(+), 242 deletions(-) diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 4f9b9fb59362..64f61330564a 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -13,6 +13,7 @@ * datablocks 
and metadata blocks. */ +#include #include #include #include @@ -27,44 +28,103 @@ #include "page_actor.h" /* - * Read the metadata block length, this is stored in the first two - * bytes of the metadata block. + * Returns the amount of bytes copied to the page actor. */ -static struct buffer_head *get_block_length(struct super_block *sb, - u64 *cur_index, int *offset, int *length) +static int copy_bio_to_actor(struct bio *bio, + struct squashfs_page_actor *actor, + int offset, int req_length) +{ + void *actor_addr = squashfs_first_page(actor); + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int copied_bytes = 0; + int actor_offset = 0; + + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) + return 0; + + while (copied_bytes < req_length) { + int bytes_to_copy = min_t(int, bvec->bv_len - offset, + PAGE_SIZE - actor_offset); + + bytes_to_copy = min_t(int, bytes_to_copy, + req_length - copied_bytes); + memcpy(actor_addr + actor_offset, + page_address(bvec->bv_page) + bvec->bv_offset + offset, + bytes_to_copy); + + actor_offset += bytes_to_copy; + copied_bytes += bytes_to_copy; + offset += bytes_to_copy; + + if (actor_offset >= PAGE_SIZE) { + actor_addr = squashfs_next_page(actor); + if (!actor_addr) + break; + actor_offset = 0; + } + if (offset >= bvec->bv_len) { + if (!bio_next_segment(bio, &iter_all)) + break; + offset = 0; + } + } + squashfs_finish_page(actor); + return copied_bytes; +} + +static int squashfs_bio_read(struct super_block *sb, u64 index, int length, + struct bio **biop, int *block_offset) { struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head *bh; - - bh = sb_bread(sb, *cur_index); - if (bh == NULL) - return NULL; - - if (msblk->devblksize - *offset == 1) { - *length = (unsigned char) bh->b_data[*offset]; - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *length |= (unsigned char) bh->b_data[0] << 8; - *offset = 1; - } else { - *length = (unsigned 
char) bh->b_data[*offset] | - (unsigned char) bh->b_data[*offset + 1] << 8; - *offset += 2; - - if (*offset == msblk->devblksize) { - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *offset = 0; + const u64 read_start = round_down(index, msblk->devblksize); + const sector_t block = read_start >> msblk->devblksize_log2; + const u64 read_end = round_up(index + length, msblk->devblksize); + const sector_t block_end = read_end >> msblk->devblksize_log2; + int offset = read_start - round_down(index, PAGE_SIZE); + int total_len = (block_end - block) << msblk->devblksize_log2; + const int page_count = DIV_ROUND_UP(total_len + offset, PAGE_SIZE); + int error, i; + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, page_count); + if (!bio) + return -ENOMEM; + + bio_set_dev(bio, sb->s_bdev); + bio->bi_opf = READ; + bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); + + for (i = 0; i < page_count; ++i) { + unsigned int len = + min_t(unsigned int, PAGE_SIZE - offset, total_len); + struct page *page = alloc_page(GFP_NOIO); + + if (!page) { + error = -ENOMEM; + goto out_free_bio; + } + if (!bio_add_page(bio, page, len, offset)) { + error = -EIO; + goto out_free_bio; } + offset = 0; + total_len -= len; } - return bh; -} + error = submit_bio_wait(bio); + if (error) + goto out_free_bio; + *biop = bio; + *block_offset = index & ((1 << msblk->devblksize_log2) - 1); + return 0; + +out_free_bio: + bio_free_pages(bio); + bio_put(bio); + return error; +} /* * Read and decompress a metadata block or datablock. Length is non-zero @@ -76,129 +136,88 @@ static struct buffer_head *get_block_length(struct super_block *sb, * algorithms). 
*/ int squashfs_read_data(struct super_block *sb, u64 index, int length, - u64 *next_index, struct squashfs_page_actor *output) + u64 *next_index, struct squashfs_page_actor *output) { struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head **bh; - int offset = index & ((1 << msblk->devblksize_log2) - 1); - u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, avail, i; - - bh = kcalloc(((output->length + msblk->devblksize - 1) - >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); - if (bh == NULL) - return -ENOMEM; + struct bio *bio = NULL; + int compressed; + int res; + int offset; if (length) { /* * Datablock. */ - bytes = -offset; compressed = SQUASHFS_COMPRESSED_BLOCK(length); length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); - if (next_index) - *next_index = index + length; - TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", index, compressed ? "" : "un", length, output->length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto read_failure; - - for (b = 0; bytes < length; b++, cur_index++) { - bh[b] = sb_getblk(sb, cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b, bh); } else { /* * Metadata block. 
*/ - if ((index + 2) > msblk->bytes_used) - goto read_failure; + const u8 *data; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); - bh[0] = get_block_length(sb, &cur_index, &offset, &length); - if (bh[0] == NULL) - goto read_failure; - b = 1; + if (index + 2 > msblk->bytes_used) { + res = -EIO; + goto out; + } + res = squashfs_bio_read(sb, index, 2, &bio, &offset); + if (res) + goto out; + + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { + res = -EIO; + goto out_free_bio; + } + /* Extract the 2-byte LE length; it may straddle two bvecs */ + data = page_address(bvec->bv_page) + bvec->bv_offset; + length = data[offset]; + if (offset < bvec->bv_len - 1) { + length |= data[offset + 1] << 8; + } else { + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { + res = -EIO; + goto out_free_bio; + } + data = page_address(bvec->bv_page) + bvec->bv_offset; + length |= data[0] << 8; + } + bio_free_pages(bio); + bio_put(bio); - bytes = msblk->devblksize - offset; compressed = SQUASHFS_COMPRESSED(length); length = SQUASHFS_COMPRESSED_SIZE(length); - if (next_index) - *next_index = index + length + 2; + index += 2; TRACE("Block @ 0x%llx, %scompressed size %d\n", index, - compressed ? "" : "un", length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto block_release; - - for (; bytes < length; b++) { - bh[b] = sb_getblk(sb, ++cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1); + compressed ? 
"" : "un", length); } + if (next_index) + *next_index = index + length; - for (i = 0; i < b; i++) { - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - goto block_release; - } + res = squashfs_bio_read(sb, index, length, &bio, &offset); + if (res) + goto out; if (compressed) { - if (!msblk->stream) - goto read_failure; - length = squashfs_decompress(msblk, bh, b, offset, length, - output); - if (length < 0) - goto read_failure; - } else { - /* - * Block is uncompressed. - */ - int in, pg_offset = 0; - void *data = squashfs_first_page(output); - - for (bytes = length; k < b; k++) { - in = min(bytes, msblk->devblksize - offset); - bytes -= in; - while (in) { - if (pg_offset == PAGE_SIZE) { - data = squashfs_next_page(output); - pg_offset = 0; - } - avail = min_t(int, in, PAGE_SIZE - - pg_offset); - memcpy(data + pg_offset, bh[k]->b_data + offset, - avail); - in -= avail; - pg_offset += avail; - offset += avail; - } - offset = 0; - put_bh(bh[k]); + if (!msblk->stream) { + res = -EIO; + goto out_free_bio; } - squashfs_finish_page(output); + res = squashfs_decompress(msblk, bio, offset, length, output); + } else { + res = copy_bio_to_actor(bio, output, offset, length); } - kfree(bh); - return length; - -block_release: - for (; k < b; k++) - put_bh(bh[k]); +out_free_bio: + bio_free_pages(bio); + bio_put(bio); +out: + if (res < 0) + ERROR("Failed to read block 0x%llx: %d\n", index, res); -read_failure: - ERROR("squashfs_read_data failed to read block 0x%llx\n", - (unsigned long long) index); - kfree(bh); - return -EIO; + return res; } diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index ec8617523e56..1b9ccfd0aa51 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -10,13 +10,14 @@ * decompressor.h */ +#include + struct squashfs_decompressor { void *(*init)(struct squashfs_sb_info *, void *); void *(*comp_opts)(struct squashfs_sb_info *, void *, int); void (*free)(void *); int (*decompress)(struct squashfs_sb_info *, void 
*, - struct buffer_head **, int, int, int, - struct squashfs_page_actor *); + struct bio *, int, int, struct squashfs_page_actor *); int id; char *name; int supported; diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index c181dee235bb..db9f12a3ea05 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -180,14 +180,15 @@ wait: } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, + struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream); res = msblk->decompressor->decompress(msblk, decomp_stream->stream, - bh, b, offset, length, output); + bio, offset, length, output); put_decomp_stream(decomp_stream, stream); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index 2a2a2d106440..d93e12d9b712 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -72,14 +72,17 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, struct squashfs_page_actor *output) { - struct squashfs_stream __percpu *percpu = - (struct squashfs_stream __percpu *) msblk->stream; - struct squashfs_stream *stream = get_cpu_ptr(percpu); - int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, - offset, 
length, output); + struct squashfs_stream __percpu *percpu; + struct squashfs_stream *stream; + int res; + + percpu = (struct squashfs_stream __percpu *)msblk->stream; + stream = get_cpu_ptr(percpu); + res = msblk->decompressor->decompress(msblk, stream->stream, bio, + offset, length, output); put_cpu_ptr(stream); if (res < 0) diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c index 550c3e592032..4eb3d083d45e 100644 --- a/fs/squashfs/decompressor_single.c +++ b/fs/squashfs/decompressor_single.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -59,14 +59,15 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, + struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; mutex_lock(&stream->mutex); - res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + res = msblk->decompressor->decompress(msblk, stream->stream, bio, offset, length, output); mutex_unlock(&stream->mutex); diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index c4e47e0588c7..233d5582fbee 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -4,7 +4,7 @@ * Phillip Lougher */ -#include +#include #include #include #include @@ -89,20 +89,23 @@ static void lz4_free(void *strm) static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lz4 *stream = strm; void *buff = stream->input, *data; - int 
avail, i, bytes = length, res; + int bytes = length, res; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); + while (bio_next_segment(bio, &iter_all)) { + int avail = min(bytes, ((int)bvec->bv_len) - offset); + + data = page_address(bvec->bv_page) + bvec->bv_offset; + memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; - put_bh(bh[i]); } res = LZ4_decompress_safe(stream->input, stream->output, diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index aa3c3dafc33d..97bb7d92ddcd 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include #include @@ -63,21 +63,24 @@ static void lzo_free(void *strm) static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lzo *stream = strm; void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int bytes = length, res; size_t out_len = output->length; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); + while (bio_next_segment(bio, &iter_all)) { + int avail = min(bytes, ((int)bvec->bv_len) - offset); + + data = page_address(bvec->bv_page) + bvec->bv_offset; + memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; - put_bh(bh[i]); } res = lzo1x_decompress_safe(stream->input, (size_t)length, diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 2797763ed046..9783e01c8100 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -40,8 +40,8 @@ extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); /* decompressor_xxx.c */ 
extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); -extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **, - int, int, int, struct squashfs_page_actor *); +extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *, + int, int, struct squashfs_page_actor *); extern int squashfs_max_decompressors(void); /* export.c */ diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 4b2f2051a6dc..e80419aed862 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -10,7 +10,7 @@ #include -#include +#include #include #include #include @@ -117,11 +117,12 @@ static void squashfs_xz_free(void *strm) static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { - enum xz_ret xz_err; - int avail, total = 0, k = 0; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int total = 0, error = 0; struct squashfs_xz *stream = strm; xz_dec_reset(stream->state); @@ -131,11 +132,23 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.out_size = PAGE_SIZE; stream->buf.out = squashfs_first_page(output); - do { - if (stream->buf.in_pos == stream->buf.in_size && k < b) { - avail = min(length, msblk->devblksize - offset); + for (;;) { + enum xz_ret xz_err; + + if (stream->buf.in_pos == stream->buf.in_size) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + /* XZ_STREAM_END must be reached. 
*/ + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - stream->buf.in = bh[k]->b_data + offset; + stream->buf.in = data + offset; stream->buf.in_size = avail; stream->buf.in_pos = 0; offset = 0; @@ -150,23 +163,17 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } xz_err = xz_dec_run(stream->state, &stream->buf); - - if (stream->buf.in_pos == stream->buf.in_size && k < b) - put_bh(bh[k++]); - } while (xz_err == XZ_OK); + if (xz_err == XZ_STREAM_END) + break; + if (xz_err != XZ_OK) { + error = -EIO; + break; + } + } squashfs_finish_page(output); - if (xz_err != XZ_STREAM_END || k < b) - goto out; - - return total + stream->buf.out_pos; - -out: - for (; k < b; k++) - put_bh(bh[k]); - - return -EIO; + return error ? error : total + stream->buf.out_pos; } const struct squashfs_decompressor squashfs_xz_comp_ops = { diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index f2226afa1625..bcb881ec47f2 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -10,7 +10,7 @@ #include -#include +#include #include #include #include @@ -50,21 +50,35 @@ static void zlib_free(void *strm) static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { - int zlib_err, zlib_init = 0, k = 0; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int zlib_init = 0, error = 0; z_stream *stream = strm; stream->avail_out = PAGE_SIZE; stream->next_out = squashfs_first_page(output); stream->avail_in = 0; - do { - if (stream->avail_in == 0 && k < b) { - int avail = min(length, msblk->devblksize - offset); + for (;;) { + int zlib_err; + + if (stream->avail_in == 0) { + const void *data; + int avail; + + if (!bio_next_segment(bio, 
&iter_all)) { + /* Z_STREAM_END must be reached. */ + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - stream->next_in = bh[k]->b_data + offset; + stream->next_in = data + offset; stream->avail_in = avail; offset = 0; } @@ -78,37 +92,28 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { - squashfs_finish_page(output); - goto out; + error = -EIO; + break; } zlib_init = 1; } zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); - - if (stream->avail_in == 0 && k < b) - put_bh(bh[k++]); - } while (zlib_err == Z_OK); + if (zlib_err == Z_STREAM_END) + break; + if (zlib_err != Z_OK) { + error = -EIO; + break; + } + } squashfs_finish_page(output); - if (zlib_err != Z_STREAM_END) - goto out; - - zlib_err = zlib_inflateEnd(stream); - if (zlib_err != Z_OK) - goto out; - - if (k < b) - goto out; - - return stream->total_out; - -out: - for (; k < b; k++) - put_bh(bh[k]); + if (!error) + if (zlib_inflateEnd(stream) != Z_OK) + error = -EIO; - return -EIO; + return error ? 
error : stream->total_out; } const struct squashfs_decompressor squashfs_zlib_comp_ops = { diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index b448c2a1d0ed..b7cb1faa652d 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include #include @@ -59,33 +59,44 @@ static void zstd_free(void *strm) static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct workspace *wksp = strm; ZSTD_DStream *stream; size_t total_out = 0; - size_t zstd_err; - int k = 0; + int error = 0; ZSTD_inBuffer in_buf = { NULL, 0, 0 }; ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); stream = ZSTD_initDStream(wksp->window_size, wksp->mem, wksp->mem_size); if (!stream) { ERROR("Failed to initialize zstd decompressor\n"); - goto out; + return -EIO; } out_buf.size = PAGE_SIZE; out_buf.dst = squashfs_first_page(output); - do { - if (in_buf.pos == in_buf.size && k < b) { - int avail = min(length, msblk->devblksize - offset); + for (;;) { + size_t zstd_err; + if (in_buf.pos == in_buf.size) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - in_buf.src = bh[k]->b_data + offset; + in_buf.src = data + offset; in_buf.size = avail; in_buf.pos = 0; offset = 0; @@ -97,8 +108,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, /* Shouldn't run out of pages * before stream is done. 
*/ - squashfs_finish_page(output); - goto out; + error = -EIO; + break; } out_buf.pos = 0; out_buf.size = PAGE_SIZE; @@ -107,29 +118,20 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, total_out -= out_buf.pos; zstd_err = ZSTD_decompressStream(stream, &out_buf, &in_buf); total_out += out_buf.pos; /* add the additional data produced */ - - if (in_buf.pos == in_buf.size && k < b) - put_bh(bh[k++]); - } while (zstd_err != 0 && !ZSTD_isError(zstd_err)); - - squashfs_finish_page(output); - - if (ZSTD_isError(zstd_err)) { - ERROR("zstd decompression error: %d\n", - (int)ZSTD_getErrorCode(zstd_err)); - goto out; + if (zstd_err == 0) + break; + + if (ZSTD_isError(zstd_err)) { + ERROR("zstd decompression error: %d\n", + (int)ZSTD_getErrorCode(zstd_err)); + error = -EIO; + break; + } } - if (k < b) - goto out; - - return (int)total_out; - -out: - for (; k < b; k++) - put_bh(bh[k]); + squashfs_finish_page(output); - return -EIO; + return error ? error : total_out; } const struct squashfs_decompressor squashfs_zstd_comp_ops = { -- cgit v1.2.3 From 8f745e62a1926e57a671b0841241b60e80903dda Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Mon, 1 Jun 2020 21:45:26 -0700 Subject: ocfs2: add missing annotation for dlm_empty_lockres() Sparse reports a warning at dlm_empty_lockres() warning: context imbalance in dlm_purge_lockres() - unexpected unlock The root cause is the missing annotation at dlm_purge_lockres() Add the missing __must_hold(&dlm->spinlock) Signed-off-by: Jules Irenge Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Link: http://lkml.kernel.org/r/20200403160505.2832-4-jbi.octave@gmail.com Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 55a6512e9fde..f105746063ed 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ 
b/fs/ocfs2/dlm/dlmmaster.c @@ -2760,6 +2760,7 @@ leave: * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) + __must_hold(&dlm->spinlock) { int ret; int lock_dropped = 0; -- cgit v1.2.3 From 912f655d78c5d4ad05eac287f23a435924df7144 Mon Sep 17 00:00:00 2001 From: Gang He Date: Mon, 1 Jun 2020 21:45:29 -0700 Subject: ocfs2: mount shared volume without ha stack Usually we create and use an ocfs2 shared volume on top of an HA stack. A pcmk-based HA stack includes the DLM, corosync and pacemaker services. Customers complained that they could not mount an existing ocfs2 volume on a single node without the HA stack, e.g. in a single-node backup/restore scenario. In such cases, the customers just want to access the data on the existing ocfs2 volume quickly, and do not want to restart or set up the HA stack. So I'd like to add a mount option "nocluster": if users mount an ocfs2 shared volume with this option, the mount will not depend on the HA-related services. The command will mount the existing ocfs2 volume directly (like a local mount), avoiding the need to set up the HA stack. 
Signed-off-by: Gang He Signed-off-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Jun Piao Link: http://lkml.kernel.org/r/20200423053300.22661-1-ghe@suse.com Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2.h | 4 +++- fs/ocfs2/slot_map.c | 46 +++++++++++++++++++++++++++------------------- fs/ocfs2/super.c | 21 +++++++++++++++++++++ 3 files changed, 51 insertions(+), 20 deletions(-) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 9150cfa4df7d..ee5d98516212 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -279,6 +279,7 @@ enum ocfs2_mount_options OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ + OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -673,7 +674,8 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) static inline int ocfs2_mount_local(struct ocfs2_super *osb) { - return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); + return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT) + || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER)); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 8caeceeaeda7..4da0e4b1e79b 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -254,14 +254,16 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, int i, ret = -ENOSPC; if ((preferred >= 0) && (preferred < si->si_num_slots)) { - if (!si->si_slots[preferred].sl_valid) { + if (!si->si_slots[preferred].sl_valid || + !si->si_slots[preferred].sl_node_num) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (!si->si_slots[i].sl_valid) { + if (!si->si_slots[i].sl_valid || + 
!si->si_slots[i].sl_node_num) { ret = i; break; } @@ -456,24 +458,30 @@ int ocfs2_find_slot(struct ocfs2_super *osb) spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - /* search for ourselves first and take the slot if it already - * exists. Perhaps we need to mark this in a variable for our - * own journal recovery? Possibly not, though we certainly - * need to warn to the user */ - slot = __ocfs2_node_num_to_slot(si, osb->node_num); - if (slot < 0) { - /* if no slot yet, then just take 1st available - * one. */ - slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (ocfs2_mount_local(osb)) + /* use slot 0 directly in local mode */ + slot = 0; + else { + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); if (slot < 0) { - spin_unlock(&osb->osb_lock); - mlog(ML_ERROR, "no free slots available!\n"); - status = -EINVAL; - goto bail; - } - } else - printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " - "allocated to this node!\n", slot, osb->dev_str); + /* if no slot yet, then just take 1st available + * one. 
*/ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (slot < 0) { + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was " + "already allocated to this node!\n", + slot, osb->dev_str); + } ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ac61eeaf3837..71ea9ce71a6b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -175,6 +175,7 @@ enum { Opt_dir_resv_level, Opt_journal_async_commit, Opt_err_cont, + Opt_nocluster, Opt_err, }; @@ -208,6 +209,7 @@ static const match_table_t tokens = { {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err_cont, "errors=continue"}, + {Opt_nocluster, "nocluster"}, {Opt_err, NULL} }; @@ -619,6 +621,13 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } + tmp = OCFS2_MOUNT_NOCLUSTER; + if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change nocluster option on remount\n"); + goto out; + } + tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { @@ -859,6 +868,7 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, } if (ocfs2_userspace_stack(osb) && + !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && strncmp(osb->osb_cluster_stack, mopt->cluster_stack, OCFS2_STACK_LABEL_LEN)) { mlog(ML_ERROR, @@ -1139,6 +1149,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? 
"writeback" : "ordered"); + if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && + !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)) + printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted " + "without cluster aware mode.\n", osb->dev_str); + atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); @@ -1445,6 +1460,9 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_journal_async_commit: mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; break; + case Opt_nocluster: + mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1556,6 +1574,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) seq_printf(s, ",journal_async_commit"); + if (opts & OCFS2_MOUNT_NOCLUSTER) + seq_printf(s, ",nocluster"); + return 0; } -- cgit v1.2.3 From 78128fabd022240852859c0b253972147593690b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 1 Jun 2020 21:45:33 -0700 Subject: arch/parisc/include/asm/pgtable.h: remove unused `old_pte' parisc's set_pte_at() macro has set-but-not-used variable: include/linux/pgtable.h: In function 'pte_clear_not_present_full': arch/parisc/include/asm/pgtable.h:96:9: warning: variable 'old_pte' set but not used [-Wunused-but-set-variable] Reported-by: kbuild test robot Signed-off-by: Andrew Morton Cc: "James E.J. 
Bottomley" Cc: Helge Deller Cc: Mike Rapoport Signed-off-by: Linus Torvalds --- arch/parisc/include/asm/pgtable.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 9832c73a7021..cd7df48dc874 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -93,10 +93,8 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) #define set_pte_at(mm, addr, ptep, pteval) \ do { \ - pte_t old_pte; \ unsigned long flags; \ spin_lock_irqsave(pgd_spinlock((mm)->pgd), flags);\ - old_pte = *ptep; \ set_pte(ptep, pteval); \ purge_tlb_entries(mm, addr); \ spin_unlock_irqrestore(pgd_spinlock((mm)->pgd), flags);\ -- cgit v1.2.3 From 735e4ae5ba28c886d249ad04d3c8cc097dad6336 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 1 Jun 2020 21:45:36 -0700 Subject: vfs: track per-sb writeback errors and report them to syncfs Patch series "vfs: have syncfs() return error when there are writeback errors", v6. Currently, syncfs does not return errors when one of the inodes fails to be written back. It will return errors based on the legacy AS_EIO and AS_ENOSPC flags when syncing out the block device fails, but that's not particularly helpful for filesystems that aren't backed by a blockdev. It's also possible for a stray sync to lose those errors. The basic idea in this set is to track writeback errors at the superblock level, so that we can quickly and easily check whether something bad happened without having to fsync each file individually. syncfs is then changed to reliably report writeback errors after they occur, much in the same fashion as fsync does now. This patch (of 2): Usually we suggest that applications call fsync when they want to ensure that all data written to the file has made it to the backing store, but that can be inefficient when there are a lot of open files. 
Calling syncfs on the filesystem can be more efficient in some situations, but the error reporting doesn't currently work the way most people expect. If a single inode on a filesystem reports a writeback error, syncfs won't necessarily return an error. syncfs only returns an error if __sync_blockdev fails, and on some filesystems that's a no-op. It would be better if syncfs reported an error if there were any writeback failures. Then applications could call syncfs to see if there are any errors on any open files, and could then call fsync on all of the other descriptors to figure out which one failed. This patch adds a new errseq_t to struct super_block, and has mapping_set_error also record writeback errors there. To report those errors, we also need to keep an errseq_t in struct file to act as a cursor. This patch adds a dedicated field for that purpose, which slots nicely into 4 bytes of padding at the end of struct file on x86_64. An earlier version of this patch used an O_PATH file descriptor to cue the kernel that the open file should track the superblock error and not the inode's writeback error. I think that API is just too weird though. This is simpler and should make syncfs error reporting "just work" even if someone is multiplexing fsync and syncfs on the same fds. 
Signed-off-by: Jeff Layton Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Cc: Andres Freund Cc: Matthew Wilcox Cc: Al Viro Cc: Christoph Hellwig Cc: Dave Chinner Cc: David Howells Link: http://lkml.kernel.org/r/20200428135155.19223-1-jlayton@kernel.org Link: http://lkml.kernel.org/r/20200428135155.19223-2-jlayton@kernel.org Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 1 + fs/file_table.c | 1 + fs/open.c | 3 +-- fs/sync.c | 6 ++++-- include/linux/fs.h | 16 ++++++++++++++++ include/linux/pagemap.h | 5 ++++- 6 files changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 1af823b2fe6b..4c0af2eb7e19 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -377,6 +377,7 @@ static int dax_open(struct inode *inode, struct file *filp) inode->i_mapping->a_ops = &dev_dax_aops; filp->f_mapping = inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + filp->f_sb_err = file_sample_sb_err(filp); filp->private_data = dev_dax; inode->i_flags = S_DAX; diff --git a/fs/file_table.c b/fs/file_table.c index 30d55c9a1744..676e620948d2 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -198,6 +198,7 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); + file->f_sb_err = file_sample_sb_err(file); if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; diff --git a/fs/open.c b/fs/open.c index 719b320ede52..d9467a8a7f6a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -743,9 +743,8 @@ static int do_dentry_open(struct file *f, path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; - - /* Ensure that we skip any errors that predate opening of the file */ f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) 
{ f->f_mode = FMODE_PATH | FMODE_OPENED; diff --git a/fs/sync.c b/fs/sync.c index 4d1ff010bc5a..c6f6f5be5682 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -161,7 +161,7 @@ SYSCALL_DEFINE1(syncfs, int, fd) { struct fd f = fdget(fd); struct super_block *sb; - int ret; + int ret, ret2; if (!f.file) return -EBADF; @@ -171,8 +171,10 @@ SYSCALL_DEFINE1(syncfs, int, fd) ret = sync_filesystem(sb); up_read(&sb->s_umount); + ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err); + fdput(f); - return ret; + return ret ? ret : ret2; } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 45cc10cdf6dd..f2fb5b7406b9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -976,6 +976,7 @@ struct file { #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; + errseq_t f_sb_err; /* for syncfs */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@ -1520,6 +1521,9 @@ struct super_block { /* Being remounted read-only */ int s_readonly_remount; + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; @@ -2827,6 +2831,18 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) return errseq_sample(&mapping->wb_err); } +/** + * file_sample_sb_err - sample the current errseq_t to test for later errors + * @file: file pointer to be sampled + * + * Grab the most current superblock-level errseq_t value for the given + * struct file. 
+ */ +static inline errseq_t file_sample_sb_err(struct file *file) +{ + return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err); +} + static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index a8f7bd8ea1c6..d4409b13747e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -51,7 +51,10 @@ static inline void mapping_set_error(struct address_space *mapping, int error) return; /* Record in wb_err for checkers using errseq_t based tracking */ - filemap_set_wb_err(mapping, error); + __filemap_set_wb_err(mapping, error); + + /* Record it in superblock */ + errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) -- cgit v1.2.3 From 485e9605c05733759d3bd5aba4fbe561801f3658 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 1 Jun 2020 21:45:40 -0700 Subject: fs/buffer.c: record blockdev write errors in super_block that it backs When syncing out a block device (a'la __sync_blockdev), any error encountered will only be recorded in the bd_inode's mapping. When the blockdev contains a filesystem however, we'd like to also record the error in the super_block that's stored there. Make mark_buffer_write_io_error also record the error in the corresponding super_block when a writeback error occurs and the block device contains a mounted superblock. Since superblocks are RCU freed, hold the rcu_read_lock to ensure that the superblock doesn't go away while we're marking it. 
Signed-off-by: Jeff Layton Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Cc: Al Viro Cc: Andres Freund Cc: Matthew Wilcox Cc: David Howells Cc: Christoph Hellwig Cc: Dave Chinner Link: http://lkml.kernel.org/r/20200428135155.19223-3-jlayton@kernel.org Signed-off-by: Linus Torvalds --- fs/buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index a60f60396cfa..15f25170615a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1154,12 +1154,19 @@ EXPORT_SYMBOL(mark_buffer_dirty); void mark_buffer_write_io_error(struct buffer_head *bh) { + struct super_block *sb; + set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ if (bh->b_page && bh->b_page->mapping) mapping_set_error(bh->b_page->mapping, -EIO); if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); + rcu_read_lock(); + sb = READ_ONCE(bh->b_bdev->bd_super); + if (sb) + errseq_set(&sb->s_wb_err, -EIO); + rcu_read_unlock(); } EXPORT_SYMBOL(mark_buffer_write_io_error); -- cgit v1.2.3 From 49f2d2419d60a103752e5fbaf158cf8d07c0d884 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 1 Jun 2020 21:45:43 -0700 Subject: usercopy: mark dma-kmalloc caches as usercopy caches We have seen a "usercopy: Kernel memory overwrite attempt detected to SLUB object 'dma-kmalloc-1 k' (offset 0, size 11)!" error on s390x, as IUCV uses kmalloc() with __GFP_DMA because of memory address restrictions. The issue has been discussed [2] and it has been noted that if all the kmalloc caches are marked as usercopy, there's little reason not to mark dma-kmalloc caches too. The 'dma' part merely means that __GFP_DMA is used to restrict memory address range. As Jann Horn put it [3]: "I think dma-kmalloc slabs should be handled the same way as normal kmalloc slabs. 
When a dma-kmalloc allocation is freshly created, it is just normal kernel memory - even if it might later be used for DMA -, and it should be perfectly fine to copy_from_user() into such allocations at that point, and to copy_to_user() out of them at the end. If you look at the places where such allocations are created, you can see things like kmemdup(), memcpy() and so on - all normal operations that shouldn't conceptually be different from usercopy in any relevant way." Thus this patch marks the dma-kmalloc-* caches as usercopy. [1] https://bugzilla.suse.com/show_bug.cgi?id=1156053 [2] https://lore.kernel.org/kernel-hardening/bfca96db-bbd0-d958-7732-76e36c667c68@suse.cz/ [3] https://lore.kernel.org/kernel-hardening/CAG48ez1a4waGk9kB0WLaSbs4muSoK0AYAVk8=XYaKj4_+6e6Hg@mail.gmail.com/ Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Acked-by: Christian Borntraeger Acked-by: Jiri Slaby Cc: Jann Horn Cc: Christoph Hellwig Cc: Christopher Lameter Cc: Julian Wiedmann Cc: Ursula Braun Cc: Alexander Viro Cc: David Windsor Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andy Lutomirski Cc: "David S. Miller" Cc: Laura Abbott Cc: Mark Rutland Cc: "Martin K. 
Petersen" Cc: Paolo Bonzini Cc: Christoffer Dall Cc: Dave Kleikamp Cc: Jan Kara Cc: Luis de Bethencourt Cc: Marc Zyngier Cc: Rik van Riel Cc: Matthew Garrett Cc: Michal Kubecek Link: http://lkml.kernel.org/r/7d810f6d-8085-ea2f-7805-47ba3842dc50@suse.cz Signed-off-by: Linus Torvalds --- mm/slab_common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 23c7500eea7d..9e72ba224175 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1303,7 +1303,8 @@ void __init create_kmalloc_caches(slab_flags_t flags) kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( kmalloc_info[i].name[KMALLOC_DMA], kmalloc_info[i].size, - SLAB_CACHE_DMA | flags, 0, 0); + SLAB_CACHE_DMA | flags, 0, + kmalloc_info[i].size); } } #endif -- cgit v1.2.3 From 52f23478081ae0dcdb95d1650ea1e7d52d586829 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 1 Jun 2020 21:45:47 -0700 Subject: mm/slub.c: fix corrupted freechain in deactivate_slab() The slub_debug is able to fix the corrupted slab freelist/page. However, alloc_debug_processing() only checks the validity of current and next freepointer during allocation path. As a result, once some objects have their freepointers corrupted, deactivate_slab() may lead to page fault. Below is from a test kernel module when 'slub_debug=PUF,kmalloc-128 slub_nomerge'. The test kernel corrupts the freepointer of one free object on purpose. Unfortunately, deactivate_slab() does not detect it when iterating the freechain. BUG: unable to handle page fault for address: 00000000123456f8 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI ... ... RIP: 0010:deactivate_slab.isra.92+0xed/0x490 ... ... 
Call Trace: ___slab_alloc+0x536/0x570 __slab_alloc+0x17/0x30 __kmalloc+0x1d9/0x200 ext4_htree_store_dirent+0x30/0xf0 htree_dirblock_to_tree+0xcb/0x1c0 ext4_htree_fill_tree+0x1bc/0x2d0 ext4_readdir+0x54f/0x920 iterate_dir+0x88/0x190 __x64_sys_getdents+0xa6/0x140 do_syscall_64+0x49/0x170 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Therefore, this patch adds extra consistency check in deactivate_slab(). Once an object's freepointer is corrupted, all following objects starting at this object are isolated. [akpm@linux-foundation.org: fix build with CONFIG_SLAB_DEBUG=n] Signed-off-by: Dongli Zhang Signed-off-by: Andrew Morton Cc: Joe Jin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200331031450.12182-1-dongli.zhang@oracle.com Signed-off-by: Linus Torvalds --- mm/slub.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index b762450fc9f0..6972c27ae394 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -679,6 +679,20 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
va_end(args); } +static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + void *freelist, void *nextfree) +{ + if ((s->flags & SLAB_CONSISTENCY_CHECKS) && + !check_valid_pointer(s, page, nextfree)) { + object_err(s, page, freelist, "Freechain corrupt"); + freelist = NULL; + slab_fix(s, "Isolate corrupted freechain"); + return true; + } + + return false; +} + static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) { unsigned int off; /* Offset of last byte */ @@ -1410,6 +1424,11 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + void *freelist, void *nextfree) +{ + return false; +} #endif /* CONFIG_SLUB_DEBUG */ /* @@ -2093,6 +2112,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, void *prior; unsigned long counters; + /* + * If 'nextfree' is invalid, it is possible that the object at + * 'freelist' is already corrupted. So isolate all objects + * starting at 'freelist'. + */ + if (freelist_corrupted(s, page, freelist, nextfree)) + break; + do { prior = page->freelist; counters = page->counters; -- cgit v1.2.3 From d7660ce5914d396242bfc56c8f45ef117101fb58 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 1 Jun 2020 21:45:50 -0700 Subject: slub: Remove userspace notifier for cache add/remove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I came across some unnecessary uevents once again which reminded me this. The patch seems to be lost in the leaves of the original discussion [1], so resending. [1] https://lore.kernel.org/r/alpine.DEB.2.21.2001281813130.745@www.lameter.com Kmem caches are internal kernel structures so it is strange that userspace notifiers would be needed. And I am not aware of any use of these notifiers. 
These notifiers may just exist because in the initial slub release the sysfs code was copied from another subsystem. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Acked-by: Michal Koutný Acked-by: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200423115721.19821-1-mkoutny@suse.com Signed-off-by: Linus Torvalds --- mm/slub.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 6972c27ae394..2ae5580433af 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5715,19 +5715,6 @@ static struct kobj_type slab_ktype = { .release = kmem_cache_release, }; -static int uevent_filter(struct kset *kset, struct kobject *kobj) -{ - struct kobj_type *ktype = get_ktype(kobj); - - if (ktype == &slab_ktype) - return 1; - return 0; -} - -static const struct kset_uevent_ops slab_uevent_ops = { - .filter = uevent_filter, -}; - static struct kset *slab_kset; static inline struct kset *cache_kset(struct kmem_cache *s) @@ -5795,7 +5782,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work) #ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif - kobject_uevent(&s->kobj, KOBJ_REMOVE); out: kobject_put(&s->kobj); } @@ -5853,7 +5839,6 @@ static int sysfs_slab_add(struct kmem_cache *s) } #endif - kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); @@ -5934,7 +5919,7 @@ static int __init slab_sysfs_init(void) mutex_lock(&slab_mutex); - slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); + slab_kset = kset_create_and_add("slab", NULL, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); pr_err("Cannot register slab subsystem.\n"); -- cgit v1.2.3 From aa456c7aebb14a4ff47611586397b5b3e84fbf37 Mon Sep 17 00:00:00 2001 From: Christopher Lameter Date: Mon, 1 Jun 2020 21:45:53 -0700 Subject: slub: remove kmalloc under list_lock from list_slab_objects() V2 list_slab_objects() is 
called when a slab is destroyed and there are objects still left to list the objects in the syslog. This is a pretty rare event. And there it seems we take the list_lock and call kmalloc while holding that lock. Perform the allocation in free_partial() before the list_lock is taken. Fixes: bbd7d57bfe852d9788bae5fb171c7edb4021d8ac ("slub: Potential stack overflow") Signed-off-by: Christopher Lameter Signed-off-by: Andrew Morton Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Tetsuo Handa Cc: Yu Zhao Link: http://lkml.kernel.org/r/alpine.DEB.2.21.2002031721250.1668@www.lameter.com Signed-off-by: Linus Torvalds --- mm/slub.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 2ae5580433af..3a76de69a268 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3766,12 +3766,14 @@ error: } static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text) + const char *text, unsigned long *map) { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - unsigned long *map; + + if (!map) + return; slab_err(s, page, text, s->name); slab_lock(page); @@ -3784,8 +3786,6 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, print_tracking(s, p); } } - put_map(map); - slab_unlock(page); #endif } @@ -3799,6 +3799,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { LIST_HEAD(discard); struct page *page, *h; + unsigned long *map = NULL; + +#ifdef CONFIG_SLUB_DEBUG + map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); +#endif BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); @@ -3808,11 +3813,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, - "Objects remaining in %s on __kmem_cache_shutdown()"); + "Objects remaining in %s on __kmem_cache_shutdown()", + map); } } spin_unlock_irq(&n->list_lock); +#ifdef 
CONFIG_SLUB_DEBUG + bitmap_free(map); +#endif + list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } -- cgit v1.2.3 From a68ee0573991e90af2f1785db309206408bad3e5 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 1 Jun 2020 21:45:57 -0700 Subject: mm/slub: fix stack overruns with SLUB_STATS There is no need to copy SLUB_STATS items from root memcg cache to new memcg cache copies. Doing so could result in stack overruns because the store function only accepts 0 to clear the stat and returns an error for everything else while the show method would print out the whole stat. Then, the mismatch between the lengths returned by the show and store methods shows up in memcg_propagate_slab_attrs(): else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) buf = mbuf; max_attr_size is only 2 from slab_attr_store(), then, it uses mbuf[64] in show_stat() later where a bunch of sprintf() calls would overrun the stack variable. Fix it by always allocating a page of buffer to be used in show_stat() if SLUB_STATS=y which should only be used for debugging purposes. 
# echo 1 > /sys/kernel/slab/fs_cache/shrink BUG: KASAN: stack-out-of-bounds in number+0x421/0x6e0 Write of size 1 at addr ffffc900256cfde0 by task kworker/76:0/53251 Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 Workqueue: memcg_kmem_cache memcg_kmem_cache_create_func Call Trace: number+0x421/0x6e0 vsnprintf+0x451/0x8e0 sprintf+0x9e/0xd0 show_stat+0x124/0x1d0 alloc_slowpath_show+0x13/0x20 __kmem_cache_create+0x47a/0x6b0 addr ffffc900256cfde0 is located in stack of task kworker/76:0/53251 at offset 0 in frame: process_one_work+0x0/0xb90 this frame has 1 object: [32, 72) 'lockdep_map' Memory state around the buggy address: ffffc900256cfc80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ffffc900256cfd00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffffc900256cfd80: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 ^ ffffc900256cfe00: 00 00 00 00 00 f2 f2 f2 00 00 00 00 00 00 00 00 ffffc900256cfe80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: __kmem_cache_create+0x6ac/0x6b0 Workqueue: memcg_kmem_cache memcg_kmem_cache_create_func Call Trace: __kmem_cache_create+0x6ac/0x6b0 Fixes: 107dab5c92d5 ("slub: slub-specific propagation changes") Signed-off-by: Qian Cai Signed-off-by: Andrew Morton Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200429222356.4322-1-cai@lca.pw Signed-off-by: Linus Torvalds --- mm/slub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 3a76de69a268..2c56cc9e4ff2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5691,7 +5691,8 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) */ if (buffer) buf = buffer; - else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) && + 
!IS_ENABLED(CONFIG_SLUB_STATS)) buf = mbuf; else { buffer = (char *) get_zeroed_page(GFP_KERNEL); -- cgit v1.2.3 From a3df69278c5052acf0a5335b3fc614e0a7e5ea93 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 1 Jun 2020 21:46:00 -0700 Subject: Documentation/vm/slub.rst: s/Toggle/Enable/ "toggle" means to change a boolean thing's state. This operation doesn't do that - it sets it to "true". Signed-off-by: Andrew Morton Acked-by: Rafael Aquini Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Linus Torvalds --- Documentation/vm/slub.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst index 933ada4368ff..4eee598555c9 100644 --- a/Documentation/vm/slub.rst +++ b/Documentation/vm/slub.rst @@ -49,7 +49,7 @@ Possible debug options are:: P Poisoning (object and padding) U User tracking (free and alloc) T Trace (please only use on single slabs) - A Toggle failslab filter mark for the cache + A Enable failslab filter mark for the cache O Switch debugging off for caches that would have caused higher minimum slab orders - Switch all debugging off (useful if the kernel is -- cgit v1.2.3 From 002ae7057069538aa3afd500f6f60a429cb948b2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 1 Jun 2020 21:46:03 -0700 Subject: mm, dump_page(): do not crash with invalid mapping pointer We have seen the following problem on a RPi4 with 1G RAM: BUG: Bad page state in process systemd-hwdb pfn:35601 page:ffff7e0000d58040 refcount:15 mapcount:131221 mapping:efd8fe765bc80080 index:0x1 compound_mapcount: -32767 Unable to handle kernel paging request at virtual address efd8fe765bc80080 Mem abort info: ESR = 0x96000004 Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 [efd8fe765bc80080] address between user and kernel address ranges Internal error: Oops: 96000004 [#1] SMP Modules linked 
in: btrfs libcrc32c xor xor_neon zlib_deflate raid6_pq mmc_block xhci_pci xhci_hcd usbcore sdhci_iproc sdhci_pltfm sdhci mmc_core clk_raspberrypi gpio_raspberrypi_exp pcie_brcmstb bcm2835_dma gpio_regulator phy_generic fixed sg scsi_mod efivarfs Supported: No, Unreleased kernel CPU: 3 PID: 408 Comm: systemd-hwdb Not tainted 5.3.18-8-default #1 SLE15-SP2 (unreleased) Hardware name: raspberrypi rpi/rpi, BIOS 2020.01 02/21/2020 pstate: 40000085 (nZcv daIf -PAN -UAO) pc : __dump_page+0x268/0x368 lr : __dump_page+0xc4/0x368 sp : ffff000012563860 x29: ffff000012563860 x28: ffff80003ddc4300 x27: 0000000000000010 x26: 000000000000003f x25: ffff7e0000d58040 x24: 000000000000000f x23: efd8fe765bc80080 x22: 0000000000020095 x21: efd8fe765bc80080 x20: ffff000010ede8b0 x19: ffff7e0000d58040 x18: ffffffffffffffff x17: 0000000000000001 x16: 0000000000000007 x15: ffff000011689708 x14: 3030386362353637 x13: 6566386466653a67 x12: 6e697070616d2031 x11: 32323133313a746e x10: 756f6370616d2035 x9 : ffff00001168a840 x8 : ffff00001077a670 x7 : 000000000000013d x6 : ffff0000118a43b5 x5 : 0000000000000001 x4 : ffff80003dd9e2c8 x3 : ffff80003dd9e2c8 x2 : 911c8d7c2f483500 x1 : dead000000000100 x0 : efd8fe765bc80080 Call trace: __dump_page+0x268/0x368 bad_page+0xd4/0x168 check_new_page_bad+0x80/0xb8 rmqueue_bulk.constprop.26+0x4d8/0x788 get_page_from_freelist+0x4d4/0x1228 __alloc_pages_nodemask+0x134/0xe48 alloc_pages_vma+0x198/0x1c0 do_anonymous_page+0x1a4/0x4d8 __handle_mm_fault+0x4e8/0x560 handle_mm_fault+0x104/0x1e0 do_page_fault+0x1e8/0x4c0 do_translation_fault+0xb0/0xc0 do_mem_abort+0x50/0xb0 el0_da+0x24/0x28 Code: f9401025 8b8018a0 9a851005 17ffffca (f94002a0) Besides the underlying issue with page->mapping containing a bogus value for some reason, we can see that __dump_page() crashed by trying to read the pointer at mapping->host, turning a recoverable warning into full Oops. 
It can be expected that when page is reported as bad state for some reason, the pointers there should not be trusted blindly. So this patch treats all data in __dump_page() that depends on page->mapping as lava, using probe_kernel_read_strict(). Ideally this would include the dentry->d_parent recursively, but that would mean changing printk handler for %pd. Chances of reaching the dentry printing part with an initially bogus mapping pointer should be rather low, though. Also prefix printing mapping->a_ops with a description of what is being printed. In case the value is bogus, %ps will print raw value instead of the symbol name and then it's not obvious at all that it's printing a_ops. Reported-by: Petr Tesarik Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Acked-by: Kirill A. Shutemov Cc: Matthew Wilcox Cc: John Hubbard Link: http://lkml.kernel.org/r/20200331165454.12263-1-vbabka@suse.cz Signed-off-by: Linus Torvalds --- mm/debug.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/mm/debug.c b/mm/debug.c index 2189357f0987..f2ede2df585a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -110,13 +110,57 @@ void __dump_page(struct page *page, const char *reason) else if (PageAnon(page)) type = "anon "; else if (mapping) { - if (mapping->host && mapping->host->i_dentry.first) { - struct dentry *dentry; - dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); - pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry); - } else - pr_warn("%ps\n", mapping->a_ops); + const struct inode *host; + const struct address_space_operations *a_ops; + const struct hlist_node *dentry_first; + const struct dentry *dentry_ptr; + struct dentry dentry; + + /* + * mapping can be invalid pointer and we don't want to crash + * accessing it, so probe everything depending on it carefully + */ + if (probe_kernel_read_strict(&host, &mapping->host, + sizeof(struct inode *)) || + 
probe_kernel_read_strict(&a_ops, &mapping->a_ops, + sizeof(struct address_space_operations *))) { + pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n"); + goto out_mapping; + } + + if (!host) { + pr_warn("mapping->a_ops:%ps\n", a_ops); + goto out_mapping; + } + + if (probe_kernel_read_strict(&dentry_first, + &host->i_dentry.first, sizeof(struct hlist_node *))) { + pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n", + a_ops, host); + goto out_mapping; + } + + if (!dentry_first) { + pr_warn("mapping->a_ops:%ps\n", a_ops); + goto out_mapping; + } + + dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + if (probe_kernel_read_strict(&dentry, dentry_ptr, + sizeof(struct dentry))) { + pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n", + a_ops, dentry_ptr); + } else { + /* + * if dentry is corrupted, the %pd handler may still + * crash, but it's unlikely that we reach here with a + * corrupted struct page + */ + pr_warn("mapping->aops:%ps dentry name:\"%pd\"\n", + a_ops, &dentry); + } } +out_mapping: BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags, -- cgit v1.2.3 From cee9a0c4e84db024d692d6b5c18f65465eb06905 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:07 -0700 Subject: mm: move readahead prototypes from mm.h Patch series "Change readahead API", v11. This series adds a readahead address_space operation to replace the readpages operation. The key difference is that pages are added to the page cache as they are allocated (and then looked up by the filesystem) instead of passing them on a list to the readpages operation and having the filesystem add them to the page cache. 
It's a net reduction in code for each implementation, more efficient than walking a list, and solves the direct-write vs buffered-read problem reported by yu kuai at http://lkml.kernel.org/r/20200116063601.39201-1-yukuai3@huawei.com The only unconverted filesystems are those which use fscache. Their conversion is pending Dave Howells' rewrite which will make the conversion substantially easier. This should be completed by the end of the year. I want to thank the reviewers/testers; Dave Chinner, John Hubbard, Eric Biggers, Johannes Thumshirn, Dave Sterba, Zi Yan, Christoph Hellwig and Miklos Szeredi have done a marvellous job of providing constructive criticism. These patches pass an xfstests run on ext4, xfs & btrfs with no regressions that I can tell (some of the tests seem a little flaky before and remain flaky afterwards). This patch (of 25): The readahead code is part of the page cache so should be found in the pagemap.h file. force_page_cache_readahead is only used within mm, so move it to mm/internal.h instead. Remove the parameter names where they add no value, and rename the ones which were actively misleading. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Reviewed-by: Johannes Thumshirn Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-1-willy@infradead.org Link: http://lkml.kernel.org/r/20200414150233.24495-2-willy@infradead.org Signed-off-by: Linus Torvalds --- block/blk-core.c | 1 + include/linux/mm.h | 19 ------------------- include/linux/pagemap.h | 8 ++++++++ mm/fadvise.c | 2 ++ mm/internal.h | 2 ++ 5 files changed, 13 insertions(+), 19 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9bfaee050c82..38d7b1f16067 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index f3fe7371855c..92704fde6475 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2612,25 +2612,6 @@ extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); int __must_check write_one_page(struct page *page); void task_dirty_inc(struct task_struct *tsk); -/* readahead.c */ -#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) - -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); - -void page_cache_sync_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - pgoff_t offset, - unsigned long size); - -void page_cache_async_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - struct page *pg, - pgoff_t offset, - unsigned long size); - extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d4409b13747e..8c081cda4b35 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -618,6 +618,14 @@ int 
replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); +#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) + +void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, + struct file *, pgoff_t index, unsigned long req_count); +void page_cache_async_readahead(struct address_space *, struct file_ra_state *, + struct file *, struct page *, pgoff_t index, + unsigned long req_count); + /* * Like add_to_page_cache_locked, but used to add newly allocated pages: * the page is new, so we can just run __SetPageLocked() against it. diff --git a/mm/fadvise.c b/mm/fadvise.c index 4f17c83db575..3efebfb9952c 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -22,6 +22,8 @@ #include +#include "internal.h" + /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. diff --git a/mm/internal.h b/mm/internal.h index b5634e78f01d..25fee17c7334 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,6 +49,8 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); +int force_page_cache_readahead(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read); extern unsigned int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size); -- cgit v1.2.3 From 9a42823a102eb10dd1cc09930dc7e20042698e23 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:10 -0700 Subject: mm: return void from various readahead functions ondemand_readahead has two callers, neither of which use the return value. That means that both ra_submit and __do_page_cache_readahead() can return void, and we don't need to worry that a present page in the readahead window causes us to return a smaller nr_pages than we ought to have. 
Similarly, no caller uses the return value from force_page_cache_readahead(). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Dave Chinner Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-3-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/fadvise.c | 4 ---- mm/internal.h | 12 ++++++------ mm/readahead.c | 31 +++++++++++++------------------ 3 files changed, 19 insertions(+), 28 deletions(-) diff --git a/mm/fadvise.c b/mm/fadvise.c index 3efebfb9952c..0e66f2aaeea3 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -104,10 +104,6 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) if (!nrpages) nrpages = ~0UL; - /* - * Ignore return value because fadvise() shall return - * success even if filesystem can't retrieve a hint, - */ force_page_cache_readahead(mapping, file, start_index, nrpages); break; case POSIX_FADV_NOREUSE: diff --git a/mm/internal.h b/mm/internal.h index 25fee17c7334..f762a34b0c57 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -49,20 +49,20 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); -int force_page_cache_readahead(struct address_space *, struct file *, +void force_page_cache_readahead(struct address_space *, struct file *, pgoff_t index, unsigned long nr_to_read); -extern unsigned int __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t offset, unsigned long nr_to_read, +void __do_page_cache_readahead(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size); /* * Submit IO for the read-ahead request in file_ra_state. 
*/ -static inline unsigned long ra_submit(struct file_ra_state *ra, +static inline void ra_submit(struct file_ra_state *ra, struct address_space *mapping, struct file *filp) { - return __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); + __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); } /** diff --git a/mm/readahead.c b/mm/readahead.c index 2fe72cd29b47..41a592886da7 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -149,10 +149,8 @@ out: * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. - * - * Returns the number of pages requested, or the maximum amount of I/O allowed. */ -unsigned int __do_page_cache_readahead(struct address_space *mapping, +void __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size) { @@ -166,7 +164,7 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, gfp_t gfp_mask = readahead_gfp_mask(mapping); if (isize == 0) - goto out; + return; end_index = ((isize - 1) >> PAGE_SHIFT); @@ -211,23 +209,21 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, if (nr_pages) read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask); BUG_ON(!list_empty(&page_pool)); -out: - return nr_pages; } /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much * memory at once. 
*/ -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) +void force_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t offset, unsigned long nr_to_read) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); struct file_ra_state *ra = &filp->f_ra; unsigned long max_pages; if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) - return -EINVAL; + return; /* * If the request exceeds the readahead window, allow the read to @@ -245,7 +241,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, offset += this_chunk; nr_to_read -= this_chunk; } - return 0; } /* @@ -378,11 +373,10 @@ static int try_context_readahead(struct address_space *mapping, /* * A minimal readahead algorithm for trivial sequential/random reads. */ -static unsigned long -ondemand_readahead(struct address_space *mapping, - struct file_ra_state *ra, struct file *filp, - bool hit_readahead_marker, pgoff_t offset, - unsigned long req_size) +static void ondemand_readahead(struct address_space *mapping, + struct file_ra_state *ra, struct file *filp, + bool hit_readahead_marker, pgoff_t offset, + unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages = ra->ra_pages; @@ -428,7 +422,7 @@ ondemand_readahead(struct address_space *mapping, rcu_read_unlock(); if (!start || start - offset > max_pages) - return 0; + return; ra->start = start; ra->size = start - offset; /* old async_size */ @@ -464,7 +458,8 @@ ondemand_readahead(struct address_space *mapping, * standalone, small random read * Read as is, and do not pollute the readahead state. 
*/ - return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); + __do_page_cache_readahead(mapping, filp, offset, req_size, 0); + return; initial_readahead: ra->start = offset; @@ -489,7 +484,7 @@ readit: } } - return ra_submit(ra, mapping, filp); + ra_submit(ra, mapping, filp); } /** -- cgit v1.2.3 From a1ef8566525c78a9eca52e1ff91404f4595b85eb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:14 -0700 Subject: mm: ignore return value of ->readpages We used to assign the return value to a variable, which we then ignored. Remove the pretence of caring. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: John Hubbard Reviewed-by: William Kucharski Reviewed-by: Johannes Thumshirn Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-4-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/readahead.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 41a592886da7..61b15b6b9e72 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -113,17 +113,16 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, EXPORT_SYMBOL(read_cache_pages); -static int read_pages(struct address_space *mapping, struct file *filp, +static void read_pages(struct address_space *mapping, struct file *filp, struct list_head *pages, unsigned int nr_pages, gfp_t gfp) { struct blk_plug plug; unsigned page_idx; - int ret; blk_start_plug(&plug); if (mapping->a_ops->readpages) { - ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + mapping->a_ops->readpages(filp, mapping, pages, nr_pages); /* Clean up the remaining pages */ put_pages_list(pages); goto out; @@ -136,12 +135,9 @@ static int read_pages(struct 
address_space *mapping, struct file *filp, mapping->a_ops->readpage(filp, page); put_page(page); } - ret = 0; out: blk_finish_plug(&plug); - - return ret; } /* -- cgit v1.2.3 From ad4ae1c732bc9159ffdeb225036c601dddbbbe75 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:18 -0700 Subject: mm: move readahead nr_pages check into read_pages Simplify the callers by moving the check for nr_pages and the BUG_ON into read_pages(). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Zi Yan Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Reviewed-by: Johannes Thumshirn Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-5-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/readahead.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 61b15b6b9e72..9fcd4e32b62d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -119,6 +119,9 @@ static void read_pages(struct address_space *mapping, struct file *filp, struct blk_plug plug; unsigned page_idx; + if (!nr_pages) + return; + blk_start_plug(&plug); if (mapping->a_ops->readpages) { @@ -138,6 +141,8 @@ static void read_pages(struct address_space *mapping, struct file *filp, out: blk_finish_plug(&plug); + + BUG_ON(!list_empty(pages)); } /* @@ -180,8 +185,7 @@ void __do_page_cache_readahead(struct address_space *mapping, * contiguous pages before continuing with the next * batch. 
*/ - if (nr_pages) - read_pages(mapping, filp, &page_pool, nr_pages, + read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask); nr_pages = 0; continue; @@ -202,9 +206,7 @@ void __do_page_cache_readahead(struct address_space *mapping, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - if (nr_pages) - read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask); - BUG_ON(!list_empty(&page_pool)); + read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask); } /* -- cgit v1.2.3 From 042124cc64c33555deba0b11c6e0c612ae7a8653 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:21 -0700 Subject: mm: add new readahead_control API Filesystems which implement the upcoming ->readahead method will get their pages by calling readahead_page() or readahead_page_batch(). These functions support large pages, even though none of the filesystems to be converted do yet. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-6-willy@infradead.org Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 140 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8c081cda4b35..c3bf73263ec9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -642,6 +642,146 @@ static inline int add_to_page_cache(struct page *page, return error; } +/** + * struct readahead_control - Describes a readahead request. + * + * A readahead request is for consecutive pages. 
Filesystems which + * implement the ->readahead method should call readahead_page() or + * readahead_page_batch() in a loop and attempt to start I/O against + * each page in the request. + * + * Most of the fields in this struct are private and should be accessed + * by the functions below. + * + * @file: The file, used primarily by network filesystems for authentication. + * May be NULL if invoked internally by the filesystem. + * @mapping: Readahead this filesystem object. + */ +struct readahead_control { + struct file *file; + struct address_space *mapping; +/* private: use the readahead_* accessors instead */ + pgoff_t _index; + unsigned int _nr_pages; + unsigned int _batch_count; +}; + +/** + * readahead_page - Get the next page to read. + * @rac: The current readahead request. + * + * Context: The page is locked and has an elevated refcount. The caller + * should decrease the refcount once the page has been submitted for I/O + * and unlock the page once all I/O to that page has completed. + * Return: A pointer to the next page, or %NULL if we are done.
+ */ +static inline struct page *readahead_page(struct readahead_control *rac) +{ + struct page *page; + + BUG_ON(rac->_batch_count > rac->_nr_pages); + rac->_nr_pages -= rac->_batch_count; + rac->_index += rac->_batch_count; + + if (!rac->_nr_pages) { + rac->_batch_count = 0; + return NULL; + } + + page = xa_load(&rac->mapping->i_pages, rac->_index); + VM_BUG_ON_PAGE(!PageLocked(page), page); + rac->_batch_count = hpage_nr_pages(page); + + return page; +} + +static inline unsigned int __readahead_batch(struct readahead_control *rac, + struct page **array, unsigned int array_sz) +{ + unsigned int i = 0; + XA_STATE(xas, &rac->mapping->i_pages, 0); + struct page *page; + + BUG_ON(rac->_batch_count > rac->_nr_pages); + rac->_nr_pages -= rac->_batch_count; + rac->_index += rac->_batch_count; + rac->_batch_count = 0; + + xas_set(&xas, rac->_index); + rcu_read_lock(); + xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(PageTail(page), page); + array[i++] = page; + rac->_batch_count += hpage_nr_pages(page); + + /* + * The page cache isn't using multi-index entries yet, + * so the xas cursor needs to be manually moved to the + * next index. This can be removed once the page cache + * is converted. + */ + if (PageHead(page)) + xas_set(&xas, rac->_index + rac->_batch_count); + + if (i == array_sz) + break; + } + rcu_read_unlock(); + + return i; +} + +/** + * readahead_page_batch - Get a batch of pages to read. + * @rac: The current readahead request. + * @array: An array of pointers to struct page. + * + * Context: The pages are locked and have an elevated refcount. The caller + * should decrease the refcount once the page has been submitted for I/O + * and unlock the page once all I/O to that page has completed. + * Return: The number of pages placed in the array. 0 indicates the request + * is complete.
+ */ +#define readahead_page_batch(rac, array) \ + __readahead_batch(rac, array, ARRAY_SIZE(array)) + +/** + * readahead_pos - The byte offset into the file of this readahead request. + * @rac: The readahead request. + */ +static inline loff_t readahead_pos(struct readahead_control *rac) +{ + return (loff_t)rac->_index * PAGE_SIZE; +} + +/** + * readahead_length - The number of bytes in this readahead request. + * @rac: The readahead request. + */ +static inline loff_t readahead_length(struct readahead_control *rac) +{ + return (loff_t)rac->_nr_pages * PAGE_SIZE; +} + +/** + * readahead_index - The index of the first page in this readahead request. + * @rac: The readahead request. + */ +static inline pgoff_t readahead_index(struct readahead_control *rac) +{ + return rac->_index; +} + +/** + * readahead_count - The number of pages in this readahead request. + * @rac: The readahead request. + */ +static inline unsigned int readahead_count(struct readahead_control *rac) +{ + return rac->_nr_pages; +} + static inline unsigned long dir_pages(struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> -- cgit v1.2.3 From a4d9653666b3212032e2794dd506b34f94a60d40 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:25 -0700 Subject: mm: use readahead_control to pass arguments In this patch, only between __do_page_cache_readahead() and read_pages(), but it will be extended in upcoming patches. The read_pages() function becomes aops centric, as this makes the most sense by the end of the patchset. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Reviewed-by: Johannes Thumshirn Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-7-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/readahead.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 9fcd4e32b62d..9d9aa4ffc7d4 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -113,29 +113,32 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, EXPORT_SYMBOL(read_cache_pages); -static void read_pages(struct address_space *mapping, struct file *filp, - struct list_head *pages, unsigned int nr_pages, gfp_t gfp) +static void read_pages(struct readahead_control *rac, struct list_head *pages, + gfp_t gfp) { + const struct address_space_operations *aops = rac->mapping->a_ops; struct blk_plug plug; unsigned page_idx; - if (!nr_pages) + if (!readahead_count(rac)) return; blk_start_plug(&plug); - if (mapping->a_ops->readpages) { - mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + if (aops->readpages) { + aops->readpages(rac->file, rac->mapping, pages, + readahead_count(rac)); /* Clean up the remaining pages */ put_pages_list(pages); goto out; } - for (page_idx = 0; page_idx < nr_pages; page_idx++) { + for (page_idx = 0; page_idx < readahead_count(rac); page_idx++) { struct page *page = lru_to_page(pages); list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) - mapping->a_ops->readpage(filp, page); + if (!add_to_page_cache_lru(page, rac->mapping, page->index, + gfp)) + aops->readpage(rac->file, page); put_page(page); } @@ -143,6 +146,7 @@ out: blk_finish_plug(&plug); BUG_ON(!list_empty(pages)); + rac->_nr_pages = 0; } /* @@ -160,9 +164,12 @@ void __do_page_cache_readahead(struct address_space *mapping, unsigned long end_index; /* The last page we want to read */ LIST_HEAD(page_pool); int page_idx; - 
unsigned int nr_pages = 0; loff_t isize = i_size_read(inode); gfp_t gfp_mask = readahead_gfp_mask(mapping); + struct readahead_control rac = { + .mapping = mapping, + .file = filp, + }; if (isize == 0) return; @@ -185,9 +192,7 @@ void __do_page_cache_readahead(struct address_space *mapping, * contiguous pages before continuing with the next * batch. */ - read_pages(mapping, filp, &page_pool, nr_pages, - gfp_mask); - nr_pages = 0; + read_pages(&rac, &page_pool, gfp_mask); continue; } @@ -198,7 +203,7 @@ void __do_page_cache_readahead(struct address_space *mapping, list_add(&page->lru, &page_pool); if (page_idx == nr_to_read - lookahead_size) SetPageReadahead(page); - nr_pages++; + rac._nr_pages++; } /* @@ -206,7 +211,7 @@ void __do_page_cache_readahead(struct address_space *mapping, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask); + read_pages(&rac, &page_pool, gfp_mask); } /* -- cgit v1.2.3 From 08eb9658ae128df77c4cd4ca5323d84b6b6e3824 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:29 -0700 Subject: mm: rename various 'offset' parameters to 'index' The word 'offset' is used ambiguously to mean 'byte offset within a page', 'byte offset from the start of the file' and 'page offset from the start of the file'. Use 'index' to mean 'page offset from the start of the file' throughout the readahead code. [ We should probably rename the 'pgoff_t' type to 'pgidx_t' too - Linus ] Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Zi Yan Reviewed-by: William Kucharski Cc: Chao Yu Cc: Christoph Hellwig Cc: Cong Wang Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-8-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/readahead.c | 86 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 9d9aa4ffc7d4..8a65d6bd97e0 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -156,7 +156,7 @@ out: * We really don't want to intermingle reads and writes like that. */ void __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t offset, unsigned long nr_to_read, + struct file *filp, pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size) { struct inode *inode = mapping->host; @@ -180,7 +180,7 @@ void __do_page_cache_readahead(struct address_space *mapping, * Preallocate as many pages as we will need. */ for (page_idx = 0; page_idx < nr_to_read; page_idx++) { - pgoff_t page_offset = offset + page_idx; + pgoff_t page_offset = index + page_idx; if (page_offset > end_index) break; @@ -219,7 +219,7 @@ void __do_page_cache_readahead(struct address_space *mapping, * memory at once. 
*/ void force_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t offset, unsigned long nr_to_read) + struct file *filp, pgoff_t index, unsigned long nr_to_read) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); struct file_ra_state *ra = &filp->f_ra; @@ -239,9 +239,9 @@ void force_page_cache_readahead(struct address_space *mapping, if (this_chunk > nr_to_read) this_chunk = nr_to_read; - __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0); + __do_page_cache_readahead(mapping, filp, index, this_chunk, 0); - offset += this_chunk; + index += this_chunk; nr_to_read -= this_chunk; } } @@ -322,21 +322,21 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, */ /* - * Count contiguously cached pages from @offset-1 to @offset-@max, + * Count contiguously cached pages from @index-1 to @index-@max, * this count is a conservative estimation of * - length of the sequential read sequence, or * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, - pgoff_t offset, unsigned long max) + pgoff_t index, unsigned long max) { pgoff_t head; rcu_read_lock(); - head = page_cache_prev_miss(mapping, offset - 1, max); + head = page_cache_prev_miss(mapping, index - 1, max); rcu_read_unlock(); - return offset - 1 - head; + return index - 1 - head; } /* @@ -344,13 +344,13 @@ static pgoff_t count_history_pages(struct address_space *mapping, */ static int try_context_readahead(struct address_space *mapping, struct file_ra_state *ra, - pgoff_t offset, + pgoff_t index, unsigned long req_size, unsigned long max) { pgoff_t size; - size = count_history_pages(mapping, offset, max); + size = count_history_pages(mapping, index, max); /* * not enough history pages: @@ -363,10 +363,10 @@ static int try_context_readahead(struct address_space *mapping, * starts from beginning of file: * it is a strong indication of long-run stream (or whole-file-read) */ - if (size >= offset) + if 
(size >= index) size *= 2; - ra->start = offset; + ra->start = index; ra->size = min(size + req_size, max); ra->async_size = 1; @@ -378,13 +378,13 @@ static int try_context_readahead(struct address_space *mapping, */ static void ondemand_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - bool hit_readahead_marker, pgoff_t offset, + bool hit_readahead_marker, pgoff_t index, unsigned long req_size) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages = ra->ra_pages; unsigned long add_pages; - pgoff_t prev_offset; + pgoff_t prev_index; /* * If the request exceeds the readahead window, allow the read to @@ -396,15 +396,15 @@ static void ondemand_readahead(struct address_space *mapping, /* * start of file */ - if (!offset) + if (!index) goto initial_readahead; /* - * It's the expected callback offset, assume sequential access. + * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. 
*/ - if ((offset == (ra->start + ra->size - ra->async_size) || - offset == (ra->start + ra->size))) { + if ((index == (ra->start + ra->size - ra->async_size) || + index == (ra->start + ra->size))) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -421,14 +421,14 @@ static void ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_miss(mapping, offset + 1, max_pages); + start = page_cache_next_miss(mapping, index + 1, max_pages); rcu_read_unlock(); - if (!start || start - offset > max_pages) + if (!start || start - index > max_pages) return; ra->start = start; - ra->size = start - offset; /* old async_size */ + ra->size = start - index; /* old async_size */ ra->size += req_size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -443,29 +443,29 @@ static void ondemand_readahead(struct address_space *mapping, /* * sequential cache miss - * trivial case: (offset - prev_offset) == 1 - * unaligned reads: (offset - prev_offset) == 0 + * trivial case: (index - prev_index) == 1 + * unaligned reads: (index - prev_index) == 0 */ - prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; - if (offset - prev_offset <= 1UL) + prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; + if (index - prev_index <= 1UL) goto initial_readahead; /* * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, offset, req_size, max_pages)) + if (try_context_readahead(mapping, ra, index, req_size, max_pages)) goto readit; /* * standalone, small random read * Read as is, and do not pollute the readahead state. 
*/ - __do_page_cache_readahead(mapping, filp, offset, req_size, 0); + __do_page_cache_readahead(mapping, filp, index, req_size, 0); return; initial_readahead: - ra->start = offset; + ra->start = index; ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; @@ -476,7 +476,7 @@ readit: * the resulted next readahead window into the current one. * Take care of maximum IO pages as above. */ - if (offset == ra->start && ra->size == ra->async_size) { + if (index == ra->start && ra->size == ra->async_size) { add_pages = get_next_ra_size(ra, max_pages); if (ra->size + add_pages <= max_pages) { ra->async_size = add_pages; @@ -495,9 +495,8 @@ readit: * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @filp: passed on to ->readpage() and ->readpages() - * @offset: start offset into @mapping, in pagecache page-sized units - * @req_size: hint: total size of the read which the caller is performing in - * pagecache pages + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. * * page_cache_sync_readahead() should be called when a cache miss happened: * it will submit the read. 
The readahead logic may decide to piggyback more @@ -506,7 +505,7 @@ readit: */ void page_cache_sync_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - pgoff_t offset, unsigned long req_size) + pgoff_t index, unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -517,12 +516,12 @@ void page_cache_sync_readahead(struct address_space *mapping, /* be dumb */ if (filp && (filp->f_mode & FMODE_RANDOM)) { - force_page_cache_readahead(mapping, filp, offset, req_size); + force_page_cache_readahead(mapping, filp, index, req_count); return; } /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, false, offset, req_size); + ondemand_readahead(mapping, ra, filp, false, index, req_count); } EXPORT_SYMBOL_GPL(page_cache_sync_readahead); @@ -531,21 +530,20 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @filp: passed on to ->readpage() and ->readpages() - * @page: the page at @offset which has the PG_readahead flag set - * @offset: start offset into @mapping, in pagecache page-sized units - * @req_size: hint: total size of the read which the caller is performing in - * pagecache pages + * @page: The page at @index which triggered the readahead call. + * @index: Index of first page to be read. + * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which - * has the PG_readahead flag; this is a marker to suggest that the application + * is marked as PageReadahead; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. 
*/ void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, - struct page *page, pgoff_t offset, - unsigned long req_size) + struct page *page, pgoff_t index, + unsigned long req_count) { /* no read-ahead */ if (!ra->ra_pages) @@ -569,7 +567,7 @@ page_cache_async_readahead(struct address_space *mapping, return; /* do read-ahead */ - ondemand_readahead(mapping, ra, filp, true, offset, req_size); + ondemand_readahead(mapping, ra, filp, true, index, req_count); } EXPORT_SYMBOL_GPL(page_cache_async_readahead); -- cgit v1.2.3 From c2c7ad74b16206e0e6e9e80af962e63da778acf9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:32 -0700 Subject: mm: rename readahead loop variable to 'i' Change the type of page_idx to unsigned long, and rename it -- it's just a loop counter, not a page index. Suggested-by: John Hubbard Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Dave Chinner Reviewed-by: William Kucharski Reviewed-by: Johannes Thumshirn Cc: Chao Yu Cc: Christoph Hellwig Cc: Cong Wang Cc: Darrick J. 
Wong Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-9-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/readahead.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 8a65d6bd97e0..7ce320854bad 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -163,13 +163,13 @@ void __do_page_cache_readahead(struct address_space *mapping, struct page *page; unsigned long end_index; /* The last page we want to read */ LIST_HEAD(page_pool); - int page_idx; loff_t isize = i_size_read(inode); gfp_t gfp_mask = readahead_gfp_mask(mapping); struct readahead_control rac = { .mapping = mapping, .file = filp, }; + unsigned long i; if (isize == 0) return; @@ -179,8 +179,8 @@ void __do_page_cache_readahead(struct address_space *mapping, /* * Preallocate as many pages as we will need. */ - for (page_idx = 0; page_idx < nr_to_read; page_idx++) { - pgoff_t page_offset = index + page_idx; + for (i = 0; i < nr_to_read; i++) { + pgoff_t page_offset = index + i; if (page_offset > end_index) break; @@ -201,7 +201,7 @@ void __do_page_cache_readahead(struct address_space *mapping, break; page->index = page_offset; list_add(&page->lru, &page_pool); - if (page_idx == nr_to_read - lookahead_size) + if (i == nr_to_read - lookahead_size) SetPageReadahead(page); rac._nr_pages++; } -- cgit v1.2.3 From ef8153b609aa9302f1e727218068b84ac8b060e4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:36 -0700 Subject: mm: remove 'page_offset' from readahead loop Replace the page_offset variable with 'index + i'. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-10-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/readahead.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 7ce320854bad..ddc63d3b07b8 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -180,12 +180,10 @@ void __do_page_cache_readahead(struct address_space *mapping, * Preallocate as many pages as we will need. */ for (i = 0; i < nr_to_read; i++) { - pgoff_t page_offset = index + i; - - if (page_offset > end_index) + if (index + i > end_index) break; - page = xa_load(&mapping->i_pages, page_offset); + page = xa_load(&mapping->i_pages, index + i); if (page && !xa_is_value(page)) { /* * Page already present? Kick off the current batch of @@ -199,7 +197,7 @@ void __do_page_cache_readahead(struct address_space *mapping, page = __page_cache_alloc(gfp_mask); if (!page) break; - page->index = page_offset; + page->index = index + i; list_add(&page->lru, &page_pool); if (i == nr_to_read - lookahead_size) SetPageReadahead(page); -- cgit v1.2.3 From c1f6925e10912c7e329840387730049e5e1848c8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:40 -0700 Subject: mm: put readahead pages in cache earlier When populating the page cache for readahead, mappings that use ->readpages must populate the page cache themselves as the pages are passed on a linked list which would normally be used for the page cache's LRU. For mappings that use ->readpage or the upcoming ->readahead method, we can put the pages into the page cache as soon as they're allocated, which solves a race between readahead and direct IO. It also lets us remove the gfp argument from read_pages(). 
Use the new readahead_page() API to implement the repeated calls to ->readpage(), just like most filesystems will. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-11-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/readahead.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index ddc63d3b07b8..e52b3a7b9da5 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -114,14 +114,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, EXPORT_SYMBOL(read_cache_pages); static void read_pages(struct readahead_control *rac, struct list_head *pages, - gfp_t gfp) + bool skip_page) { const struct address_space_operations *aops = rac->mapping->a_ops; + struct page *page; struct blk_plug plug; - unsigned page_idx; if (!readahead_count(rac)) - return; + goto out; blk_start_plug(&plug); @@ -130,23 +130,23 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, readahead_count(rac)); /* Clean up the remaining pages */ put_pages_list(pages); - goto out; - } - - for (page_idx = 0; page_idx < readahead_count(rac); page_idx++) { - struct page *page = lru_to_page(pages); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, rac->mapping, page->index, - gfp)) + rac->_index += rac->_nr_pages; + rac->_nr_pages = 0; + } else { + while ((page = readahead_page(rac))) { aops->readpage(rac->file, page); - put_page(page); + put_page(page); + } } -out: blk_finish_plug(&plug); BUG_ON(!list_empty(pages)); - rac->_nr_pages = 0; + BUG_ON(readahead_count(rac)); + +out: + if (skip_page) + 
rac->_index++; } /* @@ -168,6 +168,7 @@ void __do_page_cache_readahead(struct address_space *mapping, struct readahead_control rac = { .mapping = mapping, .file = filp, + ._index = index, }; unsigned long i; @@ -183,6 +184,8 @@ void __do_page_cache_readahead(struct address_space *mapping, if (index + i > end_index) break; + BUG_ON(index + i != rac._index + rac._nr_pages); + page = xa_load(&mapping->i_pages, index + i); if (page && !xa_is_value(page)) { /* @@ -190,15 +193,22 @@ void __do_page_cache_readahead(struct address_space *mapping, * contiguous pages before continuing with the next * batch. */ - read_pages(&rac, &page_pool, gfp_mask); + read_pages(&rac, &page_pool, true); continue; } page = __page_cache_alloc(gfp_mask); if (!page) break; - page->index = index + i; - list_add(&page->lru, &page_pool); + if (mapping->a_ops->readpages) { + page->index = index + i; + list_add(&page->lru, &page_pool); + } else if (add_to_page_cache_lru(page, mapping, index + i, + gfp_mask) < 0) { + put_page(page); + read_pages(&rac, &page_pool, true); + continue; + } if (i == nr_to_read - lookahead_size) SetPageReadahead(page); rac._nr_pages++; @@ -209,7 +219,7 @@ void __do_page_cache_readahead(struct address_space *mapping, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - read_pages(&rac, &page_pool, gfp_mask); + read_pages(&rac, &page_pool, false); } /* -- cgit v1.2.3 From 8151b4c8bee43cea7a28cb0300123df90880e60c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:44 -0700 Subject: mm: add readahead address space operation This replaces ->readpages with a saner interface: - Return void instead of an ignored error code. - Page cache is already populated with locked pages when ->readahead is called. - New arguments can be passed to the implementation without changing all the filesystems that use a common helper function like mpage_readahead(). 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-12-willy@infradead.org Signed-off-by: Linus Torvalds --- Documentation/filesystems/locking.rst | 6 +++++- Documentation/filesystems/vfs.rst | 15 +++++++++++++++ include/linux/fs.h | 2 ++ mm/readahead.c | 12 ++++++++++-- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 5057e4d9dcd1..0af2e0e11461 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -239,6 +239,7 @@ prototypes:: int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); + void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, @@ -271,7 +272,8 @@ writepage: yes, unlocks (see below) readpage: yes, unlocks writepages: set_page_dirty no -readpages: +readahead: yes, unlocks +readpages: no write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: @@ -295,6 +297,8 @@ the request handler (/dev/loop). ->readpage() unlocks the page, either synchronously or via I/O completion. +->readahead() unlocks the pages that I/O is attempted on like ->readpage(). + ->readpages() populates the pagecache with the passed pages and starts I/O against them. They come unlocked upon I/O completion. 
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 7d4d09dd5e6d..ed17771c212b 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -706,6 +706,7 @@ cache in your filesystem. The following members are defined: int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); + void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, @@ -781,12 +782,26 @@ cache in your filesystem. The following members are defined: If defined, it should set the PageDirty flag, and the PAGECACHE_TAG_DIRTY tag in the radix tree. +``readahead`` + Called by the VM to read pages associated with the address_space + object. The pages are consecutive in the page cache and are + locked. The implementation should decrement the page refcount + after starting I/O on each page. Usually the page will be + unlocked by the I/O completion handler. If the filesystem decides + to stop attempting I/O before reaching the end of the readahead + window, it can simply return. The caller will decrement the page + refcount and unlock the remaining pages for you. Set PageUptodate + if the I/O completes successfully. Setting PageError on any page + will be ignored; simply unlock the page if an I/O error occurs. + ``readpages`` called by the VM to read pages associated with the address_space object. This is essentially just a vector version of readpage. Instead of just one page, several pages are requested. readpages is only used for read-ahead, so read errors are ignored. If anything goes wrong, feel free to give up. + This interface is deprecated and will be removed by the end of + 2020; implement readahead instead. 
``write_begin`` Called by the generic buffered write code to ask the filesystem diff --git a/include/linux/fs.h b/include/linux/fs.h index f2fb5b7406b9..1434ed801b80 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -292,6 +292,7 @@ enum positive_aop_returns { struct page; struct address_space; struct writeback_control; +struct readahead_control; /* * Write life time hint values. @@ -375,6 +376,7 @@ struct address_space_operations { */ int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); + void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, diff --git a/mm/readahead.c b/mm/readahead.c index e52b3a7b9da5..d01531ef9f3c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -125,7 +125,14 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, blk_start_plug(&plug); - if (aops->readpages) { + if (aops->readahead) { + aops->readahead(rac); + /* Clean up the remaining pages */ + while ((page = readahead_page(rac))) { + unlock_page(page); + put_page(page); + } + } else if (aops->readpages) { aops->readpages(rac->file, rac->mapping, pages, readahead_count(rac)); /* Clean up the remaining pages */ @@ -233,7 +240,8 @@ void force_page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra = &filp->f_ra; unsigned long max_pages; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && + !mapping->a_ops->readahead)) return; /* -- cgit v1.2.3 From b0f31d78cbc191058e654c8eb062a864b6c9a7eb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:47 -0700 Subject: mm: move end_index check out of readahead loop By reducing nr_to_read, we can eliminate this check from inside the loop. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: William Kucharski Cc: Chao Yu Cc: Christoph Hellwig Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-13-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/readahead.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index d01531ef9f3c..998fdd23c0b1 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -167,8 +167,6 @@ void __do_page_cache_readahead(struct address_space *mapping, unsigned long lookahead_size) { struct inode *inode = mapping->host; - struct page *page; - unsigned long end_index; /* The last page we want to read */ LIST_HEAD(page_pool); loff_t isize = i_size_read(inode); gfp_t gfp_mask = readahead_gfp_mask(mapping); @@ -178,22 +176,26 @@ void __do_page_cache_readahead(struct address_space *mapping, ._index = index, }; unsigned long i; + pgoff_t end_index; /* The last page we want to read */ if (isize == 0) return; - end_index = ((isize - 1) >> PAGE_SHIFT); + end_index = (isize - 1) >> PAGE_SHIFT; + if (index > end_index) + return; + /* Don't read past the page containing the last byte of the file */ + if (nr_to_read > end_index - index) + nr_to_read = end_index - index + 1; /* * Preallocate as many pages as we will need. */ for (i = 0; i < nr_to_read; i++) { - if (index + i > end_index) - break; + struct page *page = xa_load(&mapping->i_pages, index + i); BUG_ON(index + i != rac._index + rac._nr_pages); - page = xa_load(&mapping->i_pages, index + i); if (page && !xa_is_value(page)) { /* * Page already present? 
Kick off the current batch of -- cgit v1.2.3 From 2c684234d36f7e8c80414e4a772911d407e821fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:51 -0700 Subject: mm: add page_cache_readahead_unbounded ext4 and f2fs have duplicated the guts of the readahead code so they can read past i_size. Instead, separate out the guts of the readahead code so they can call it directly. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Tested-by: Eric Biggers Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Reviewed-by: Eric Biggers Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-14-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/ext4/verity.c | 35 ++----------------------- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 3 --- fs/f2fs/verity.c | 35 ++----------------------- include/linux/pagemap.h | 3 +++ mm/readahead.c | 68 ++++++++++++++++++++++++++++++++++--------------- 6 files changed, 55 insertions(+), 91 deletions(-) diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index dc5ec724d889..dec1244dd062 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -342,37 +342,6 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf, return desc_size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. 
- */ -static void ext4_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -386,8 +355,8 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - ext4_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cdf2f626bea7..ae14e952df4f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2177,7 +2177,7 @@ out: * use ->readpage() or do the necessary surgery to decouple ->readpages() * from read-ahead. 
*/ -int f2fs_mpage_readpages(struct address_space *mapping, +static int f2fs_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, unsigned nr_pages, bool is_readahead) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 157eec348970..b5a5da74c013 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3373,9 +3373,6 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index d7d430a6f130..865c9fb774fb 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -222,37 +222,6 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, return size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. 
- */ -static void f2fs_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - f2fs_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -266,8 +235,8 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - f2fs_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c3bf73263ec9..c6348c50136f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -625,6 +625,9 @@ void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, void page_cache_async_readahead(struct address_space *, struct file_ra_state *, struct file *, struct page *, pgoff_t index, unsigned long req_count); +void page_cache_readahead_unbounded(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_count); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: diff --git a/mm/readahead.c b/mm/readahead.c index 998fdd23c0b1..ae231a5312cb 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -156,37 +156,34 @@ out: rac->_index++; } -/* - * __do_page_cache_readahead() actually 
reads a chunk of disk. It allocates - * the pages first, then submits them for I/O. This avoids the very bad - * behaviour which would occur if page allocations are causing VM writeback. - * We really don't want to intermingle reads and writes like that. +/** + * page_cache_readahead_unbounded - Start unchecked readahead. + * @mapping: File address space. + * @file: This instance of the open file; used for authentication. + * @index: First page index to read. + * @nr_to_read: The number of pages to read. + * @lookahead_size: Where to start the next readahead. + * + * This function is for filesystems to call when they want to start + * readahead beyond a file's stated i_size. This is almost certainly + * not the function you want to call. Use page_cache_async_readahead() + * or page_cache_sync_readahead() instead. + * + * Context: File is referenced by caller. Mutexes may be held by caller. + * May sleep, but will not reenter filesystem to reclaim memory. */ -void __do_page_cache_readahead(struct address_space *mapping, - struct file *filp, pgoff_t index, unsigned long nr_to_read, +void page_cache_readahead_unbounded(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, unsigned long lookahead_size) { - struct inode *inode = mapping->host; LIST_HEAD(page_pool); - loff_t isize = i_size_read(inode); gfp_t gfp_mask = readahead_gfp_mask(mapping); struct readahead_control rac = { .mapping = mapping, - .file = filp, + .file = file, ._index = index, }; unsigned long i; - pgoff_t end_index; /* The last page we want to read */ - - if (isize == 0) - return; - - end_index = (isize - 1) >> PAGE_SHIFT; - if (index > end_index) - return; - /* Don't read past the page containing the last byte of the file */ - if (nr_to_read > end_index - index) - nr_to_read = end_index - index + 1; /* * Preallocate as many pages as we will need. 
@@ -230,6 +227,35 @@ void __do_page_cache_readahead(struct address_space *mapping, */ read_pages(&rac, &page_pool, false); } +EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded); + +/* + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates + * the pages first, then submits them for I/O. This avoids the very bad + * behaviour which would occur if page allocations are causing VM writeback. + * We really don't want to intermingle reads and writes like that. + */ +void __do_page_cache_readahead(struct address_space *mapping, + struct file *file, pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_size) +{ + struct inode *inode = mapping->host; + loff_t isize = i_size_read(inode); + pgoff_t end_index; /* The last page we want to read */ + + if (isize == 0) + return; + + end_index = (isize - 1) >> PAGE_SHIFT; + if (index > end_index) + return; + /* Don't read past the page containing the last byte of the file */ + if (nr_to_read > end_index - index) + nr_to_read = end_index - index + 1; + + page_cache_readahead_unbounded(mapping, file, index, nr_to_read, + lookahead_size); +} /* * Chunk the readahead into 2 megabyte units, so that we don't pin too much -- cgit v1.2.3 From 2d8163e4899dad92175eedd1c2326c875eaa74fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:54 -0700 Subject: mm: document why we don't set PageReadahead If the page is already in cache, we don't set PageReadahead on it. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-15-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/readahead.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index ae231a5312cb..73cb59ed5cff 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -195,9 +195,12 @@ void page_cache_readahead_unbounded(struct address_space *mapping, if (page && !xa_is_value(page)) { /* - * Page already present? Kick off the current batch of - * contiguous pages before continuing with the next - * batch. + * Page already present? Kick off the current batch + * of contiguous pages before continuing with the + * next batch. This page may be the one we would + * have intended to mark as Readahead, but we don't + * have a stable reference to this page, and it's + * not worth getting one just for that. */ read_pages(&rac, &page_pool, true); continue; -- cgit v1.2.3 From f2c817bed58d9be2051fad1d18e167e173c0c227 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:58 -0700 Subject: mm: use memalloc_nofs_save in readahead path Ensure that memory allocations in the readahead path do not attempt to reclaim file-backed pages, which could lead to a deadlock. It is possible, though unlikely this is the root cause of a problem observed by Cong Wang. Reported-by: Cong Wang Suggested-by: Michal Hocko Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: William Kucharski Cc: Chao Yu Cc: Christoph Hellwig Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-16-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/readahead.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mm/readahead.c b/mm/readahead.c index 73cb59ed5cff..3c9a8dd7c56c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "internal.h" @@ -185,6 +186,18 @@ void page_cache_readahead_unbounded(struct address_space *mapping, }; unsigned long i; + /* + * Partway through the readahead operation, we will have added + * locked pages to the page cache, but will not yet have submitted + * them for I/O. Adding another page may need to allocate memory, + * which can trigger memory reclaim. Telling the VM we're in + * the middle of a filesystem operation will cause it to not + * touch file-backed pages, preventing a deadlock. Most (all?) + * filesystems already specify __GFP_NOFS in their mapping's + * gfp_mask, but let's be explicit here. + */ + unsigned int nofs = memalloc_nofs_save(); + /* * Preallocate as many pages as we will need. */ @@ -229,6 +242,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping, * will then handle the error. */ read_pages(&rac, &page_pool, false); + memalloc_nofs_restore(nofs); } EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded); -- cgit v1.2.3 From d4388340ae0bc8397ef5b24342279f7739982918 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:02 -0700 Subject: fs: convert mpage_readpages to mpage_readahead Implement the new readahead aop and convert all callers (block_dev, exfat, ext2, fat, gfs2, hpfs, isofs, jfs, nilfs2, ocfs2, omfs, qnx6, reiserfs & udf). The callers are all trivial except for GFS2 & OCFS2. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Junxiao Bi # ocfs2 Reviewed-by: Joseph Qi # ocfs2 Reviewed-by: Dave Chinner Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-17-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/block_dev.c | 7 +++---- fs/exfat/inode.c | 7 +++---- fs/ext2/inode.c | 10 ++++------ fs/fat/inode.c | 7 +++---- fs/gfs2/aops.c | 23 ++++++++--------------- fs/hpfs/file.c | 7 +++---- fs/iomap/buffered-io.c | 2 +- fs/isofs/inode.c | 7 +++---- fs/jfs/inode.c | 7 +++---- fs/mpage.c | 38 +++++++++++--------------------------- fs/nilfs2/inode.c | 15 +++------------ fs/ocfs2/aops.c | 34 +++++++++++++--------------------- fs/omfs/file.c | 7 +++---- fs/qnx6/inode.c | 7 +++---- fs/reiserfs/inode.c | 8 +++----- fs/udf/inode.c | 7 +++---- include/linux/mpage.h | 4 ++-- mm/migrate.c | 2 +- 18 files changed, 73 insertions(+), 126 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 93672c3f1c78..f05e2f2c898d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -614,10 +614,9 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } -static int blkdev_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void blkdev_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); + mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(struct file *file, struct address_space *mapping, @@ -2085,7 +2084,7 @@ static int blkdev_writepages(struct address_space *mapping, static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, - .readpages = 
blkdev_readpages, + .readahead = blkdev_readahead, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 06887492f54b..785ead346543 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -372,10 +372,9 @@ static int exfat_readpage(struct file *file, struct page *page) return mpage_readpage(page, exfat_get_block); } -static int exfat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void exfat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, exfat_get_block); + mpage_readahead(rac, exfat_get_block); } static int exfat_writepage(struct page *page, struct writeback_control *wbc) @@ -502,7 +501,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations exfat_aops = { .readpage = exfat_readpage, - .readpages = exfat_readpages, + .readahead = exfat_readahead, .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c885cf7d724b..2875c0a705b5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -877,11 +877,9 @@ static int ext2_readpage(struct file *file, struct page *page) return mpage_readpage(page, ext2_get_block); } -static int -ext2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ext2_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); + mpage_readahead(rac, ext2_get_block); } static int @@ -967,7 +965,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc const struct address_space_operations ext2_aops = { .readpage = ext2_readpage, - .readpages = ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_writepage, .write_begin = ext2_write_begin, 
.write_end = ext2_write_end, @@ -981,7 +979,7 @@ const struct address_space_operations ext2_aops = { const struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, - .readpages = ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_nobh_writepage, .write_begin = ext2_nobh_write_begin, .write_end = nobh_write_end, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 71946da84388..e6e68b2274a5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -210,10 +210,9 @@ static int fat_readpage(struct file *file, struct page *page) return mpage_readpage(page, fat_get_block); } -static int fat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void fat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, fat_get_block); + mpage_readahead(rac, fat_get_block); } static void fat_write_failed(struct address_space *mapping, loff_t to) @@ -344,7 +343,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations fat_aops = { .readpage = fat_readpage, - .readpages = fat_readpages, + .readahead = fat_readahead, .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 786c1ce8f030..72c9560f4467 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -577,7 +577,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, } /** - * gfs2_readpages - Read a bunch of pages at once + * gfs2_readahead - Read a bunch of pages at once * @file: The file to read from * @mapping: Address space info * @pages: List of pages to read @@ -590,31 +590,24 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, * obviously not something we'd want to do on too regular a basis. * Any I/O we ignore at this time will be done via readpage later. * 2. We don't handle stuffed files here we let readpage do the honours. 
- * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 3. mpage_readahead() does most of the heavy lifting in the common case. * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. */ -static int gfs2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void gfs2_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; + struct inode *inode = rac->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder gh; - int ret; gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (unlikely(ret)) + if (gfs2_glock_nq(&gh)) goto out_uninit; if (!gfs2_is_stuffed(ip)) - ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + mpage_readahead(rac, gfs2_block_map); gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (unlikely(gfs2_withdrawn(sdp))) - ret = -EIO; - return ret; } /** @@ -833,7 +826,7 @@ static const struct address_space_operations gfs2_aops = { .writepage = gfs2_writepage, .writepages = gfs2_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, @@ -847,7 +840,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .writepage = gfs2_jdata_writepage, .writepages = gfs2_jdata_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .set_page_dirty = jdata_set_page_dirty, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b36abf9cb345..2de0d3492d15 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -125,10 +125,9 @@ static int hpfs_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, hpfs_get_block, wbc); } -static int hpfs_readpages(struct 
file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void hpfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block); + mpage_readahead(rac, hpfs_get_block); } static int hpfs_writepages(struct address_space *mapping, @@ -198,7 +197,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, - .readpages = hpfs_readpages, + .readahead = hpfs_readahead, .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 89e21961d1ad..075db1e71b14 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -367,7 +367,7 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } /* - * Just like mpage_readpages and block_read_full_page we always + * Just like mpage_readahead and block_read_full_page we always * return 0 and just mark the page as PageError on errors. This * should be cleaned up all through the stack eventually. 
*/ diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 62c0462dc89f..95b1f377ad09 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1185,10 +1185,9 @@ static int isofs_readpage(struct file *file, struct page *page) return mpage_readpage(page, isofs_get_block); } -static int isofs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void isofs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); + mpage_readahead(rac, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1198,7 +1197,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, - .readpages = isofs_readpages, + .readahead = isofs_readahead, .bmap = _isofs_bmap }; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 9486afcdac76..6f65bfa9f18d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -296,10 +296,9 @@ static int jfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, jfs_get_block); } -static int jfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void jfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); + mpage_readahead(rac, jfs_get_block); } static void jfs_write_failed(struct address_space *mapping, loff_t to) @@ -358,7 +357,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations jfs_aops = { .readpage = jfs_readpage, - .readpages = jfs_readpages, + .readahead = jfs_readahead, .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, diff --git a/fs/mpage.c b/fs/mpage.c index ccba3c4c4479..830e6cc2a9e7 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -91,7 +91,7 @@ 
mpage_alloc(struct block_device *bdev, } /* - * support function for mpage_readpages. The fs supplied get_block might + * support function for mpage_readahead. The fs supplied get_block might * return an up to date buffer. This is used to map that buffer into * the page, which allows readpage to avoid triggering a duplicate call * to get_block. @@ -338,13 +338,8 @@ confused: } /** - * mpage_readpages - populate an address space with some pages & start reads against them - * @mapping: the address_space - * @pages: The address of a list_head which contains the target pages. These - * pages have their ->index populated and are otherwise uninitialised. - * The page at @pages->prev has the lowest file offset, and reads should be - * issued in @pages->prev to @pages->next order. - * @nr_pages: The number of pages at *@pages + * mpage_readahead - start reads against pages + * @rac: Describes which pages to read. * @get_block: The filesystem's block mapper function. * * This function walks the pages and the blocks within each page, building and @@ -381,36 +376,25 @@ confused: * * This all causes the disk requests to be issued in the correct order. 
*/ -int -mpage_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, get_block_t get_block) +void mpage_readahead(struct readahead_control *rac, get_block_t get_block) { + struct page *page; struct mpage_readpage_args args = { .get_block = get_block, .is_readahead = true, }; - unsigned page_idx; - - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = lru_to_page(pages); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, - page->index, - readahead_gfp_mask(mapping))) { - args.page = page; - args.nr_pages = nr_pages - page_idx; - args.bio = do_mpage_readpage(&args); - } + args.page = page; + args.nr_pages = readahead_count(rac); + args.bio = do_mpage_readpage(&args); put_page(page); } - BUG_ON(!list_empty(pages)); if (args.bio) mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); - return 0; } -EXPORT_SYMBOL(mpage_readpages); +EXPORT_SYMBOL(mpage_readahead); /* * This isn't called much at all @@ -563,7 +547,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * Page has buffers, but they are all unmapped. The page was * created by pagein or read over a hole which was handled by * block_read_full_page(). If this address_space is also - * using mpage_readpages then this can rarely happen. + * using mpage_readahead then this can rarely happen. */ goto confused; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 671085512e0f..ceeb3b441844 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -145,18 +145,9 @@ static int nilfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, nilfs_get_block); } -/** - * nilfs_readpages() - implement readpages() method of nilfs_aops {} - * address_space_operations. 
- * @file - file struct of the file to be read - * @mapping - address_space struct used for reading multiple pages - * @pages - the pages to be read - * @nr_pages - number of pages to be read - */ -static int nilfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void nilfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); + mpage_readahead(rac, nilfs_get_block); } static int nilfs_writepages(struct address_space *mapping, @@ -308,7 +299,7 @@ const struct address_space_operations nilfs_aops = { .readpage = nilfs_readpage, .writepages = nilfs_writepages, .set_page_dirty = nilfs_set_page_dirty, - .readpages = nilfs_readpages, + .readahead = nilfs_readahead, .write_begin = nilfs_write_begin, .write_end = nilfs_write_end, /* .releasepage = nilfs_releasepage, */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3a67a6518ddf..3bfb4147895a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -350,14 +350,11 @@ out: * grow out to a tree. If need be, detecting boundary extents could * trivially be added in a future version of ocfs2_get_block(). 
*/ -static int ocfs2_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ocfs2_readahead(struct readahead_control *rac) { - int ret, err = -EIO; - struct inode *inode = mapping->host; + int ret; + struct inode *inode = rac->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start; - struct page *last; /* * Use the nonblocking flag for the dlm code to avoid page @@ -365,36 +362,31 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping, */ ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); if (ret) - return err; + return; - if (down_read_trylock(&oi->ip_alloc_sem) == 0) { - ocfs2_inode_unlock(inode, 0); - return err; - } + if (down_read_trylock(&oi->ip_alloc_sem) == 0) + goto out_unlock; /* * Don't bother with inline-data. There isn't anything * to read-ahead in that case anyway... */ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - goto out_unlock; + goto out_up; /* * Check whether a remote node truncated this file - we just * drop out in that case as it's not worth handling here. 
*/ - last = lru_to_page(pages); - start = (loff_t)last->index << PAGE_SHIFT; - if (start >= i_size_read(inode)) - goto out_unlock; + if (readahead_pos(rac) >= i_size_read(inode)) + goto out_up; - err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + mpage_readahead(rac, ocfs2_get_block); -out_unlock: +out_up: up_read(&oi->ip_alloc_sem); +out_unlock: ocfs2_inode_unlock(inode, 0); - - return err; } /* Note: Because we don't support holes, our allocation has @@ -2474,7 +2466,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, - .readpages = ocfs2_readpages, + .readahead = ocfs2_readahead, .writepage = ocfs2_writepage, .write_begin = ocfs2_write_begin, .write_end = ocfs2_write_end, diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d640b9388238..d7b5f09d298c 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -289,10 +289,9 @@ static int omfs_readpage(struct file *file, struct page *page) return block_read_full_page(page, omfs_get_block); } -static int omfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void omfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); + mpage_readahead(rac, omfs_get_block); } static int omfs_writepage(struct page *page, struct writeback_control *wbc) @@ -373,7 +372,7 @@ const struct inode_operations omfs_file_inops = { const struct address_space_operations omfs_aops = { .readpage = omfs_readpage, - .readpages = omfs_readpages, + .readahead = omfs_readahead, .writepage = omfs_writepage, .writepages = omfs_writepages, .write_begin = omfs_write_begin, diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 345db56c98fd..755293c8c71a 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -99,10 +99,9 @@ static int qnx6_readpage(struct file *file, struct page *page) return mpage_readpage(page, 
qnx6_get_block); } -static int qnx6_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void qnx6_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, qnx6_get_block); + mpage_readahead(rac, qnx6_get_block); } /* @@ -499,7 +498,7 @@ static sector_t qnx6_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations qnx6_aops = { .readpage = qnx6_readpage, - .readpages = qnx6_readpages, + .readahead = qnx6_readahead, .bmap = qnx6_bmap }; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6419e6dacc39..0031070b3692 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1160,11 +1160,9 @@ failure: return retval; } -static int -reiserfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void reiserfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); + mpage_readahead(rac, reiserfs_get_block); } /* @@ -3434,7 +3432,7 @@ out: const struct address_space_operations reiserfs_address_space_operations = { .writepage = reiserfs_writepage, .readpage = reiserfs_readpage, - .readpages = reiserfs_readpages, + .readahead = reiserfs_readahead, .releasepage = reiserfs_releasepage, .invalidatepage = reiserfs_invalidatepage, .write_begin = reiserfs_write_begin, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e875bc5668ee..adaba8e8b326 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -195,10 +195,9 @@ static int udf_readpage(struct file *file, struct page *page) return mpage_readpage(page, udf_get_block); } -static int udf_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void udf_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, udf_get_block); + mpage_readahead(rac, udf_get_block); } static 
int udf_write_begin(struct file *file, struct address_space *mapping, @@ -234,7 +233,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations udf_aops = { .readpage = udf_readpage, - .readpages = udf_readpages, + .readahead = udf_readahead, .writepage = udf_writepage, .writepages = udf_writepages, .write_begin = udf_write_begin, diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 001f1fcf9836..f4f5e90a6844 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -13,9 +13,9 @@ #ifdef CONFIG_BLOCK struct writeback_control; +struct readahead_control; -int mpage_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, get_block_t get_block); +void mpage_readahead(struct readahead_control *, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); diff --git a/mm/migrate.c b/mm/migrate.c index 7160c1556f79..f66f93f9a5e2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1032,7 +1032,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * to the LRU. Later, when the IO completes the pages are * marked uptodate and unlocked. However, the queueing * could be merging multiple pages for one bio (e.g. - * mpage_readpages). If an allocation happens for the + * mpage_readahead). If an allocation happens for the * second or third page, the process can end up locking * the same page twice and deadlocking. Rather than * trying to be clever about what pages can be locked, -- cgit v1.2.3 From ba206a026ff4cd0f11033ccaa4bf99c30567ded4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:05 -0700 Subject: btrfs: convert from readpages to readahead Implement the new readahead method in btrfs using the new readahead_page_batch() function. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: William Kucharski Cc: Chao Yu Cc: Christoph Hellwig Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-18-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/btrfs/extent_io.c | 43 ++++++++++++------------------------------- fs/btrfs/extent_io.h | 3 +-- fs/btrfs/inode.c | 16 +++++++--------- 3 files changed, 20 insertions(+), 42 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 39e45b8a5031..fc46adf2f5bf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4367,51 +4367,32 @@ int extent_writepages(struct address_space *mapping, return ret; } -int extent_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages) +void extent_readahead(struct readahead_control *rac) { struct bio *bio = NULL; unsigned long bio_flags = 0; struct page *pagepool[16]; struct extent_map *em_cached = NULL; - int nr = 0; u64 prev_em_start = (u64)-1; + int nr; - while (!list_empty(pages)) { - u64 contig_end = 0; - - for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { - struct page *page = lru_to_page(pages); - - prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - readahead_gfp_mask(mapping))) { - put_page(page); - break; - } - - pagepool[nr++] = page; - contig_end = page_offset(page) + PAGE_SIZE - 1; - } - - if (nr) { - u64 contig_start = page_offset(pagepool[0]); + while ((nr = readahead_page_batch(rac, pagepool))) { + u64 contig_start = page_offset(pagepool[0]); + u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1; - ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); + ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - 
contiguous_readpages(pagepool, nr, contig_start, - contig_end, &em_cached, &bio, &bio_flags, - &prev_em_start); - } + contiguous_readpages(pagepool, nr, contig_start, contig_end, + &em_cached, &bio, &bio_flags, &prev_em_start); } if (em_cached) free_extent_map(em_cached); - if (bio) - return submit_one_bio(bio, 0, bio_flags); - return 0; + if (bio) { + if (submit_one_bio(bio, 0, bio_flags)) + return; + } } /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2ed65bd0760e..25594e09fdcd 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -198,8 +198,7 @@ int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); -int extent_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages); +void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 320d1062068d..ba0aa8b4ad09 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4856,8 +4856,8 @@ static void evict_inode_truncate_pages(struct inode *inode) /* * Keep looping until we have no more ranges in the io tree. - * We can have ongoing bios started by readpages (called from readahead) - * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * We can have ongoing bios started by readahead that have + * their endio callback (extent_io.c:end_bio_extent_readpage) * still in progress (unlocked the pages in the bio but did not yet * unlocked the ranges in the io tree). 
Therefore this means some * ranges can still be locked and eviction started because before @@ -7050,11 +7050,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * for it to complete) and then invalidate the pages for * this range (through invalidate_inode_pages2_range()), * but that can lead us to a deadlock with a concurrent - * call to readpages() (a buffered read or a defrag call + * call to readahead (a buffered read or a defrag call * triggered a readahead) on a page lock due to an * ordered dio extent we created before but did not have * yet a corresponding bio submitted (whence it can not - * complete), which makes readpages() wait for that + * complete), which makes readahead wait for that * ordered extent to complete while holding a lock on * that page. */ @@ -8293,11 +8293,9 @@ static int btrfs_writepages(struct address_space *mapping, return extent_writepages(mapping, wbc); } -static int -btrfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void btrfs_readahead(struct readahead_control *rac) { - return extent_readpages(mapping, pages, nr_pages); + extent_readahead(rac); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) @@ -10553,7 +10551,7 @@ static const struct address_space_operations btrfs_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, - .readpages = btrfs_readpages, + .readahead = btrfs_readahead, .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, -- cgit v1.2.3 From 0c07a9f91ec0367925985944e288993759fb1b07 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:09 -0700 Subject: erofs: convert uncompressed files from readpages to readahead Use the new readahead operation in erofs Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: William Kucharski Reviewed-by: Chao Yu 
Acked-by: Gao Xiang Cc: Christoph Hellwig Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-19-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/erofs/data.c | 39 ++++++++++++++------------------------- fs/erofs/zdata.c | 2 +- include/trace/events/erofs.h | 6 +++--- 3 files changed, 18 insertions(+), 29 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fc3a8d8064f8..d0542151e8c4 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -280,47 +280,36 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page) return 0; } -static int erofs_raw_access_readpages(struct file *filp, - struct address_space *mapping, - struct list_head *pages, - unsigned int nr_pages) +static void erofs_raw_access_readahead(struct readahead_control *rac) { erofs_off_t last_block; struct bio *bio = NULL; - gfp_t gfp = readahead_gfp_mask(mapping); - struct page *page = list_last_entry(pages, struct page, lru); - - trace_erofs_readpages(mapping->host, page, nr_pages, true); + struct page *page; - for (; nr_pages; --nr_pages) { - page = list_entry(pages->prev, struct page, lru); + trace_erofs_readpages(rac->mapping->host, readahead_index(rac), + readahead_count(rac), true); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { - bio = erofs_read_raw_page(bio, mapping, page, - &last_block, nr_pages, true); + bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block, + readahead_count(rac), true); - /* all the page errors are ignored when readahead */ - if (IS_ERR(bio)) { - pr_err("%s, readahead error at page %lu of nid %llu\n", - __func__, page->index, - EROFS_I(mapping->host)->nid); + /* all the page errors are ignored when readahead */ + if 
(IS_ERR(bio)) { + pr_err("%s, readahead error at page %lu of nid %llu\n", + __func__, page->index, + EROFS_I(rac->mapping->host)->nid); - bio = NULL; - } + bio = NULL; } - /* pages could still be locked */ put_page(page); } - DBG_BUGON(!list_empty(pages)); /* the rare case (end in gaps) */ if (bio) submit_bio(bio); - return 0; } static int erofs_get_block(struct inode *inode, sector_t iblock, @@ -358,7 +347,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) /* for uncompressed (aligned) files and raw access for other files */ const struct address_space_operations erofs_raw_access_aops = { .readpage = erofs_raw_access_readpage, - .readpages = erofs_raw_access_readpages, + .readahead = erofs_raw_access_readahead, .bmap = erofs_bmap, }; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c4b6c9aa87ec..a78108128af3 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1317,7 +1317,7 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, struct page *head = NULL; LIST_HEAD(pagepool); - trace_erofs_readpages(mapping->host, lru_to_page(pages), + trace_erofs_readpages(mapping->host, lru_to_page(pages)->index, nr_pages, false); f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT; diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index 27f5caa6299a..bf9806fd1306 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -113,10 +113,10 @@ TRACE_EVENT(erofs_readpage, TRACE_EVENT(erofs_readpages, - TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage, + TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage, bool raw), - TP_ARGS(inode, page, nrpage, raw), + TP_ARGS(inode, start, nrpage, raw), TP_STRUCT__entry( __field(dev_t, dev ) @@ -129,7 +129,7 @@ TRACE_EVENT(erofs_readpages, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = EROFS_I(inode)->nid; - __entry->start = page->index; + __entry->start = start; 
__entry->nrpage = nrpage; __entry->raw = raw; ), -- cgit v1.2.3 From 0615090c5044cbf3bd64bfc2c3c968eaf61ab2fd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:13 -0700 Subject: erofs: convert compressed files from readpages to readahead Use the new readahead operation in erofs. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Dave Chinner Reviewed-by: William Kucharski Reviewed-by: Chao Yu Acked-by: Gao Xiang Cc: Christoph Hellwig Cc: Cong Wang Cc: Darrick J. Wong Cc: Eric Biggers Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-20-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/erofs/zdata.c | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a78108128af3..187f93b4900e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1305,28 +1305,23 @@ static bool should_decompress_synchronously(struct erofs_sb_info *sbi, return nr <= sbi->max_sync_decompress_pages; } -static int z_erofs_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void z_erofs_readahead(struct readahead_control *rac) { - struct inode *const inode = mapping->host; + struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - bool sync = should_decompress_synchronously(sbi, nr_pages); + bool sync = should_decompress_synchronously(sbi, readahead_count(rac)); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); - struct page *head = NULL; + struct page *page, *head = NULL; LIST_HEAD(pagepool); - trace_erofs_readpages(mapping->host, lru_to_page(pages)->index, - nr_pages, false); + trace_erofs_readpages(inode, 
readahead_index(rac), + readahead_count(rac), false); - f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT; - - for (; nr_pages; --nr_pages) { - struct page *page = lru_to_page(pages); + f.headoffset = readahead_pos(rac); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); /* * A pure asynchronous readahead is indicated if @@ -1335,11 +1330,6 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, */ sync &= !(PageReadahead(page) && !head); - if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { - list_add(&page->lru, &pagepool); - continue; - } - set_page_private(page, (unsigned long)head); head = page; } @@ -1368,11 +1358,10 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, /* clean up the remaining free pages */ put_pages_list(&pagepool); - return 0; } const struct address_space_operations z_erofs_aops = { .readpage = z_erofs_readpage, - .readpages = z_erofs_readpages, + .readahead = z_erofs_readahead, }; -- cgit v1.2.3 From 6311f91f76470b50c3f651475c344181adbeb869 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:16 -0700 Subject: ext4: convert from readpages to readahead Use the new readahead operation in ext4 Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: William Kucharski Reviewed-by: Eric Biggers Cc: Chao Yu Cc: Christoph Hellwig Cc: Cong Wang Cc: Darrick J. 
Wong Cc: Dave Chinner Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-21-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/ext4/ext4.h | 3 +-- fs/ext4/inode.c | 21 +++++++++------------ fs/ext4/readpage.c | 22 ++++++++-------------- 3 files changed, 18 insertions(+), 28 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3147bb0cf82a..b76e8371a60f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3318,8 +3318,7 @@ static inline void ext4_set_de_type(struct super_block *sb, /* readpages.c */ extern int ext4_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); + struct readahead_control *rac, struct page *page); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4aae6acdcb..cb58ec78c028 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3224,23 +3224,20 @@ static int ext4_readpage(struct file *file, struct page *page) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) - return ext4_mpage_readpages(page->mapping, NULL, page, 1, - false); + return ext4_mpage_readpages(page->mapping, NULL, page); return ret; } -static int -ext4_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ext4_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; + struct inode *inode = rac->mapping->host; - /* If the file has inline data, no need to do readpages. */ + /* If the file has inline data, no need to do readahead. 
*/ if (ext4_has_inline_data(inode)) - return 0; + return; - return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true); + ext4_mpage_readpages(rac->mapping, rac, NULL); } static void ext4_invalidatepage(struct page *page, unsigned int offset, @@ -3605,7 +3602,7 @@ static int ext4_set_page_dirty(struct page *page) static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, @@ -3622,7 +3619,7 @@ static const struct address_space_operations ext4_aops = { static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, @@ -3638,7 +3635,7 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c1769afbf799..66275f25235d 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -7,8 +7,8 @@ * * This was originally taken from fs/mpage.c * - * The intent is the ext4_mpage_readpages() function here is intended - * to replace mpage_readpages() in the general case, not just for + * The ext4_mpage_readpages() function here is intended to + * replace mpage_readahead() in the general case, not just for * encrypted files. It has some limitations (see below), where it * will fall back to read_block_full_page(), but these limitations * should only be hit when page_size != block_size. 
@@ -222,8 +222,7 @@ static inline loff_t ext4_readpage_limit(struct inode *inode) } int ext4_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead) + struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -241,6 +240,7 @@ int ext4_mpage_readpages(struct address_space *mapping, int length; unsigned relative_block = 0; struct ext4_map_blocks map; + unsigned int nr_pages = rac ? readahead_count(rac) : 1; map.m_pblk = 0; map.m_lblk = 0; @@ -251,14 +251,9 @@ int ext4_mpage_readpages(struct address_space *mapping, int fully_mapped = 1; unsigned first_hole = blocks_per_page; - if (pages) { - page = lru_to_page(pages); - + if (rac) { + page = readahead_page(rac); prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - readahead_gfp_mask(mapping))) - goto next_page; } if (page_has_buffers(page)) @@ -381,7 +376,7 @@ int ext4_mpage_readpages(struct address_space *mapping, bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio_set_op_attrs(bio, REQ_OP_READ, - is_readahead ? REQ_RAHEAD : 0); + rac ? REQ_RAHEAD : 0); } length = first_hole << blkbits; @@ -406,10 +401,9 @@ int ext4_mpage_readpages(struct address_space *mapping, else unlock_page(page); next_page: - if (pages) + if (rac) put_page(page); } - BUG_ON(pages && !list_empty(pages)); if (bio) submit_bio(bio); return 0; -- cgit v1.2.3 From a07f624bd69a2c1c455364ffdc751a534554b241 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:20 -0700 Subject: ext4: pass the inode to ext4_mpage_readpages This function now only uses the mapping argument to look up the inode, and both callers already have the inode, so just pass the inode instead of the mapping. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: William Kucharski Reviewed-by: Eric Biggers Cc: Chao Yu Cc: Christoph Hellwig Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-22-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 4 ++-- fs/ext4/readpage.c | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b76e8371a60f..15b062efcff1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3317,7 +3317,7 @@ static inline void ext4_set_de_type(struct super_block *sb, } /* readpages.c */ -extern int ext4_mpage_readpages(struct address_space *mapping, +extern int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct page *page); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cb58ec78c028..52be85f96159 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3224,7 +3224,7 @@ static int ext4_readpage(struct file *file, struct page *page) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) - return ext4_mpage_readpages(page->mapping, NULL, page); + return ext4_mpage_readpages(inode, NULL, page); return ret; } @@ -3237,7 +3237,7 @@ static void ext4_readahead(struct readahead_control *rac) if (ext4_has_inline_data(inode)) return; - ext4_mpage_readpages(rac->mapping, rac, NULL); + ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidatepage(struct page *page, unsigned int offset, diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 66275f25235d..5761e9961682 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -221,13 +221,12 @@ static inline loff_t ext4_readpage_limit(struct inode 
*inode) return i_size_read(inode); } -int ext4_mpage_readpages(struct address_space *mapping, +int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; -- cgit v1.2.3 From 2332319625cc5c703f79d185ac9a53db20913748 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:23 -0700 Subject: f2fs: convert from readpages to readahead Use the new readahead operation in f2fs Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: William Kucharski Reviewed-by: Eric Biggers Reviewed-by: Chao Yu Acked-by: Jaegeuk Kim Cc: Christoph Hellwig Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Gao Xiang Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-23-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/f2fs/data.c | 47 ++++++++++++++++++--------------------------- include/trace/events/f2fs.h | 6 +++--- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ae14e952df4f..5b72945bf9f1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2178,8 +2178,7 @@ out: * from read-ahead. */ static int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead) + struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -2197,6 +2196,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, .nr_cpages = 0, }; #endif + unsigned nr_pages = rac ? 
readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; @@ -2210,15 +2210,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_may_create = false; for (; nr_pages; nr_pages--) { - if (pages) { - page = list_last_entry(pages, struct page, lru); - + if (rac) { + page = readahead_page(rac); prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page_index(page), - readahead_gfp_mask(mapping))) - goto next_page; } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2228,7 +2222,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead, false); + rac != NULL, false); f2fs_destroy_compress_ctx(&cc); if (ret) goto set_error_page; @@ -2251,7 +2245,7 @@ read_single_page: #endif ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, - &bio, &last_block_in_bio, is_readahead); + &bio, &last_block_in_bio, rac); if (ret) { #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: @@ -2260,8 +2254,10 @@ set_error_page: zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); } +#ifdef CONFIG_F2FS_FS_COMPRESSION next_page: - if (pages) +#endif + if (rac) put_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2271,16 +2267,15 @@ next_page: ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead, false); + rac != NULL, false); f2fs_destroy_compress_ctx(&cc); } } #endif } - BUG_ON(pages && !list_empty(pages)); if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); - return pages ? 
0 : ret; + return ret; } static int f2fs_read_data_page(struct file *file, struct page *page) @@ -2299,28 +2294,24 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = f2fs_mpage_readpages(page_file_mapping(page), - NULL, page, 1, false); + ret = f2fs_mpage_readpages(page_file_mapping(page), NULL, page); return ret; } -static int f2fs_read_data_pages(struct file *file, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void f2fs_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; - struct page *page = list_last_entry(pages, struct page, lru); + struct inode *inode = rac->mapping->host; - trace_f2fs_readpages(inode, page, nr_pages); + trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac)); if (!f2fs_is_compress_backend_ready(inode)) - return 0; + return; /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) - return 0; + return; - return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); + f2fs_mpage_readpages(rac->mapping, rac, NULL); } int f2fs_encrypt_one_page(struct f2fs_io_info *fio) @@ -3805,7 +3796,7 @@ static void f2fs_swap_deactivate(struct file *file) const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, - .readpages = f2fs_read_data_pages, + .readahead = f2fs_readahead, .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index d97adfc327f0..24c2557c37f0 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1376,9 +1376,9 @@ TRACE_EVENT(f2fs_writepages, TRACE_EVENT(f2fs_readpages, - TP_PROTO(struct inode *inode, struct page *page, unsigned int nrpage), + TP_PROTO(struct inode *inode, pgoff_t start, unsigned int nrpage), - 
TP_ARGS(inode, page, nrpage), + TP_ARGS(inode, start, nrpage), TP_STRUCT__entry( __field(dev_t, dev) @@ -1390,7 +1390,7 @@ TRACE_EVENT(f2fs_readpages, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->start = page->index; + __entry->start = start; __entry->nrpage = nrpage; ), -- cgit v1.2.3 From e20a7693644ebf6f8005d8cdc8c8ece49bb70253 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:27 -0700 Subject: f2fs: pass the inode to f2fs_mpage_readpages This function now only uses the mapping argument to look up the inode, and both callers already have the inode, so just pass the inode instead of the mapping. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: William Kucharski Reviewed-by: Eric Biggers Reviewed-by: Chao Yu Acked-by: Jaegeuk Kim Cc: Christoph Hellwig Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Gao Xiang Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-24-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/f2fs/data.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5b72945bf9f1..03ec97f28235 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2177,12 +2177,11 @@ out: * use ->readpage() or do the necessary surgery to decouple ->readpages() * from read-ahead. 
*/ -static int f2fs_mpage_readpages(struct address_space *mapping, +static int f2fs_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - struct inode *inode = mapping->host; struct f2fs_map_blocks map; #ifdef CONFIG_F2FS_FS_COMPRESSION struct compress_ctx cc = { @@ -2294,7 +2293,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = f2fs_mpage_readpages(page_file_mapping(page), NULL, page); + ret = f2fs_mpage_readpages(inode, NULL, page); return ret; } @@ -2311,7 +2310,7 @@ static void f2fs_readahead(struct readahead_control *rac) if (f2fs_has_inline_data(inode)) return; - f2fs_mpage_readpages(rac->mapping, rac, NULL); + f2fs_mpage_readpages(inode, rac, NULL); } int f2fs_encrypt_one_page(struct f2fs_io_info *fio) -- cgit v1.2.3 From 76a0294eb19b5f909b119500e60d72ef41cc4d8e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:31 -0700 Subject: fuse: convert from readpages to readahead Implement the new readahead operation in fuse by using __readahead_batch() to fill the array of pages in fuse_args_pages directly. This lets us inline fuse_readpages_fill() into fuse_readahead(). [willy@infradead.org: build fix] Link: http://lkml.kernel.org/r/20200415025938.GB5820@bombadil.infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Dave Chinner Reviewed-by: William Kucharski Acked-by: Miklos Szeredi Cc: Chao Yu Cc: Christoph Hellwig Cc: Cong Wang Cc: Darrick J. 
Wong Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Link: http://lkml.kernel.org/r/20200414150233.24495-25-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 100 ++++++++++++++++----------------------------------------- 1 file changed, 28 insertions(+), 72 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9d67b830fb7a..bac51c32d660 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -915,84 +915,40 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) fuse_readpages_end(fc, &ap->args, err); } -struct fuse_fill_data { - struct fuse_io_args *ia; - struct file *file; - struct inode *inode; - unsigned int nr_pages; - unsigned int max_pages; -}; - -static int fuse_readpages_fill(void *_data, struct page *page) +static void fuse_readahead(struct readahead_control *rac) { - struct fuse_fill_data *data = _data; - struct fuse_io_args *ia = data->ia; - struct fuse_args_pages *ap = &ia->ap; - struct inode *inode = data->inode; + struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); + unsigned int i, max_pages, nr_pages = 0; - fuse_wait_on_page_writeback(inode, page->index); - - if (ap->num_pages && - (ap->num_pages == fc->max_pages || - (ap->num_pages + 1) * PAGE_SIZE > fc->max_read || - ap->pages[ap->num_pages - 1]->index + 1 != page->index)) { - data->max_pages = min_t(unsigned int, data->nr_pages, - fc->max_pages); - fuse_send_readpages(ia, data->file); - data->ia = ia = fuse_io_alloc(NULL, data->max_pages); - if (!ia) { - unlock_page(page); - return -ENOMEM; - } - ap = &ia->ap; - } - - if (WARN_ON(ap->num_pages >= data->max_pages)) { - unlock_page(page); - fuse_io_free(ia); - return -EIO; - } - - get_page(page); - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].length = PAGE_SIZE; - ap->num_pages++; - data->nr_pages--; - return 0; -} - -static int fuse_readpages(struct file 
*file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_fill_data data; - int err; - - err = -EIO; if (is_bad_inode(inode)) - goto out; + return; - data.file = file; - data.inode = inode; - data.nr_pages = nr_pages; - data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages); -; - data.ia = fuse_io_alloc(NULL, data.max_pages); - err = -ENOMEM; - if (!data.ia) - goto out; + max_pages = min_t(unsigned int, fc->max_pages, + fc->max_read / PAGE_SIZE); - err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) { - if (data.ia->ap.num_pages) - fuse_send_readpages(data.ia, file); - else - fuse_io_free(data.ia); + for (;;) { + struct fuse_io_args *ia; + struct fuse_args_pages *ap; + + nr_pages = readahead_count(rac) - nr_pages; + if (nr_pages > max_pages) + nr_pages = max_pages; + if (nr_pages == 0) + break; + ia = fuse_io_alloc(NULL, nr_pages); + if (!ia) + return; + ap = &ia->ap; + nr_pages = __readahead_batch(rac, ap->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + fuse_wait_on_page_writeback(inode, + readahead_index(rac) + i); + ap->descs[i].length = PAGE_SIZE; + } + ap->num_pages = nr_pages; + fuse_send_readpages(ia, rac->file); } -out: - return err; } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -3373,10 +3329,10 @@ static const struct file_operations fuse_file_operations = { static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, + .readahead = fuse_readahead, .writepage = fuse_writepage, .writepages = fuse_writepages, .launder_page = fuse_launder_page, - .readpages = fuse_readpages, .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, -- cgit v1.2.3 From 9d24a13a93d995e4c980fdaa389aa3e2f1ea0b12 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:47:34 -0700 
Subject: iomap: convert from readpages to readahead Use the new readahead operation in iomap. Convert XFS and ZoneFS to use it. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-26-willy@infradead.org Signed-off-by: Linus Torvalds --- fs/iomap/buffered-io.c | 90 +++++++++++++++++--------------------------------- fs/iomap/trace.h | 2 +- fs/xfs/xfs_aops.c | 13 +++----- fs/zonefs/super.c | 7 ++-- include/linux/iomap.h | 3 +- 5 files changed, 41 insertions(+), 74 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 075db1e71b14..890c8fcda4f3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -214,9 +214,8 @@ iomap_read_end_io(struct bio *bio) struct iomap_readpage_ctx { struct page *cur_page; bool cur_page_in_bio; - bool is_readahead; struct bio *bio; - struct list_head *pages; + struct readahead_control *rac; }; static void @@ -308,7 +307,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (ctx->bio) submit_bio(ctx->bio); - if (ctx->is_readahead) /* same as readahead_gfp_mask */ + if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); /* @@ -319,7 +318,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (!ctx->bio) ctx->bio = bio_alloc(orig_gfp, 1); ctx->bio->bi_opf = REQ_OP_READ; - if (ctx->is_readahead) + if (ctx->rac) ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; bio_set_dev(ctx->bio, iomap->bdev); @@ -375,36 +374,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } 
EXPORT_SYMBOL_GPL(iomap_readpage); -static struct page * -iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, - loff_t length, loff_t *done) -{ - while (!list_empty(pages)) { - struct page *page = lru_to_page(pages); - - if (page_offset(page) >= (u64)pos + length) - break; - - list_del(&page->lru); - if (!add_to_page_cache_lru(page, inode->i_mapping, page->index, - GFP_NOFS)) - return page; - - /* - * If we already have a page in the page cache at index we are - * done. Upper layers don't care if it is uptodate after the - * readpages call itself as every page gets checked again once - * actually needed. - */ - *done += PAGE_SIZE; - put_page(page); - } - - return NULL; -} - static loff_t -iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, +iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_readpage_ctx *ctx = data; @@ -418,10 +389,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, ctx->cur_page = NULL; } if (!ctx->cur_page) { - ctx->cur_page = iomap_next_page(inode, ctx->pages, - pos, length, &done); - if (!ctx->cur_page) - break; + ctx->cur_page = readahead_page(ctx->rac); ctx->cur_page_in_bio = false; } ret = iomap_readpage_actor(inode, pos + done, length - done, @@ -431,32 +399,43 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, return done; } -int -iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops) +/** + * iomap_readahead - Attempt to read pages from a file. + * @rac: Describes the pages to be read. + * @ops: The operations vector for the filesystem. + * + * This function is for filesystems to call to implement their readahead + * address_space operation. + * + * Context: The @ops callbacks may submit I/O (eg to read the addresses of + * blocks from disc), and may wait for it. 
The caller may be trying to + * access a different page, and so sleeping excessively should be avoided. + * It may allocate memory, but should avoid costly allocations. This + * function is called with memalloc_nofs set, so allocations will not cause + * the filesystem to be reentered. + */ +void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { + struct inode *inode = rac->mapping->host; + loff_t pos = readahead_pos(rac); + loff_t length = readahead_length(rac); struct iomap_readpage_ctx ctx = { - .pages = pages, - .is_readahead = true, + .rac = rac, }; - loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); - loff_t last = page_offset(list_entry(pages->next, struct page, lru)); - loff_t length = last - pos + PAGE_SIZE, ret = 0; - trace_iomap_readpages(mapping->host, nr_pages); + trace_iomap_readahead(inode, readahead_count(rac)); while (length > 0) { - ret = iomap_apply(mapping->host, pos, length, 0, ops, - &ctx, iomap_readpages_actor); + loff_t ret = iomap_apply(inode, pos, length, 0, ops, + &ctx, iomap_readahead_actor); if (ret <= 0) { WARN_ON_ONCE(ret == 0); - goto done; + break; } pos += ret; length -= ret; } - ret = 0; -done: + if (ctx.bio) submit_bio(ctx.bio); if (ctx.cur_page) { @@ -464,15 +443,8 @@ done: unlock_page(ctx.cur_page); put_page(ctx.cur_page); } - - /* - * Check that we didn't lose a page due to the arcance calling - * conventions.. 
- */ - WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); - return ret; } -EXPORT_SYMBOL_GPL(iomap_readpages); +EXPORT_SYMBOL_GPL(iomap_readahead); /* * iomap_is_partially_uptodate checks whether blocks within a page are diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4df19c66f597..5693a39d52fb 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -39,7 +39,7 @@ DEFINE_EVENT(iomap_readpage_class, name, \ TP_PROTO(struct inode *inode, int nr_pages), \ TP_ARGS(inode, nr_pages)) DEFINE_READPAGE_EVENT(iomap_readpage); -DEFINE_READPAGE_EVENT(iomap_readpages); +DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9d9cebf18726..1fd4fb7a607c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -621,14 +621,11 @@ xfs_vm_readpage( return iomap_readpage(page, &xfs_read_iomap_ops); } -STATIC int -xfs_vm_readpages( - struct file *unused, - struct address_space *mapping, - struct list_head *pages, - unsigned nr_pages) +STATIC void +xfs_vm_readahead( + struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops); + iomap_readahead(rac, &xfs_read_iomap_ops); } static int @@ -644,7 +641,7 @@ xfs_iomap_swapfile_activate( const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, - .readpages = xfs_vm_readpages, + .readahead = xfs_vm_readahead, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = iomap_set_page_dirty, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 3ce9829a6936..dba874a61fc5 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -78,10 +78,9 @@ static int zonefs_readpage(struct file *unused, struct page *page) return iomap_readpage(page, &zonefs_iomap_ops); } -static int zonefs_readpages(struct file *unused, struct address_space *mapping, - struct list_head *pages, unsigned int 
nr_pages) +static void zonefs_readahead(struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &zonefs_iomap_ops); + iomap_readahead(rac, &zonefs_iomap_ops); } /* @@ -128,7 +127,7 @@ static int zonefs_writepages(struct address_space *mapping, static const struct address_space_operations zonefs_file_aops = { .readpage = zonefs_readpage, - .readpages = zonefs_readpages, + .readahead = zonefs_readahead, .writepage = zonefs_writepage, .writepages = zonefs_writepages, .set_page_dirty = iomap_set_page_dirty, diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b09463dae0d..bc20bd04c2a2 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -155,8 +155,7 @@ loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops); int iomap_readpage(struct page *page, const struct iomap_ops *ops); -int iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops); +void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); int iomap_set_page_dirty(struct page *page); int iomap_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count); -- cgit v1.2.3 From b03143accd9274d9e024da42ed5857a71a6b6a27 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:47:38 -0700 Subject: include/linux/pagemap.h: introduce attach/detach_page_private MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Introduce attach/detach_page_private to cleanup code". This patch (of 10): The logic in attach_page_buffers and __clear_page_buffers are quite paired, but 1. they are located in different files. 2. attach_page_buffers is implemented in buffer_head.h, so it could be used by other files. 
But __clear_page_buffers is static function in buffer.c and other potential users can't call the function, md-bitmap even copied the function. So, introduce the new attach/detach_page_private to replace them. With the new pair of function, we will remove the usage of attach_page_buffers and __clear_page_buffers in next patches. Thanks for suggestions about the function name from Alexander Viro, Andreas Grünbacher, Christoph Hellwig and Matthew Wilcox. Suggested-by: Matthew Wilcox Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: "Darrick J. Wong" Cc: William Kucharski Cc: "Kirill A. Shutemov" Cc: Andreas Gruenbacher Cc: Yang Shi Cc: Yafang Shao Cc: Song Liu Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Cc: Alexander Viro Cc: Jaegeuk Kim Cc: Chao Yu Cc: Christoph Hellwig Cc: Anton Altaparmakov Cc: Mike Marshall Cc: Martin Brandenburg Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Roman Gushchin Cc: Andreas Dilger Cc: Chao Yu Cc: Dave Chinner Link: http://lkml.kernel.org/r/20200517214718.468-1-guoqing.jiang@cloud.ionos.com Link: http://lkml.kernel.org/r/20200517214718.468-2-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c6348c50136f..8e085713150c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -208,6 +208,43 @@ static inline int page_cache_add_speculative(struct page *page, int count) return __page_cache_add_speculative(page, count); } +/** + * attach_page_private - Attach private data to a page. + * @page: Page to attach data to. + * @data: Data to attach to page. + * + * Attaching private data to a page increments the page's reference count. + * The data must be detached before the page will be freed. 
+ */ +static inline void attach_page_private(struct page *page, void *data) +{ + get_page(page); + set_page_private(page, (unsigned long)data); + SetPagePrivate(page); +} + +/** + * detach_page_private - Detach private data from a page. + * @page: Page to detach data from. + * + * Removes the data that was previously attached to the page and decrements + * the refcount on the page. + * + * Return: Data that was attached to the page. + */ +static inline void *detach_page_private(struct page *page) +{ + void *data = (void *)page_private(page); + + if (!PagePrivate(page)) + return NULL; + ClearPagePrivate(page); + set_page_private(page, 0); + put_page(page); + + return data; +} + #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else -- cgit v1.2.3 From db2c1d86cc93b8f2674c1e032bbf8ff0cbafe122 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:47:42 -0700 Subject: md: remove __clear_page_buffers and use attach/detach_page_private After introduction attach/detach_page_private in pagemap.h, we can remove the duplicated code and call the new functions. 
Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Acked-by: Song Liu Link: http://lkml.kernel.org/r/20200517214718.468-3-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- drivers/md/md-bitmap.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b952bd45bd6a..95a5f3757fa3 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -324,14 +324,6 @@ static void end_bitmap_write(struct buffer_head *bh, int uptodate) wake_up(&bitmap->write_wait); } -/* copied from buffer.c */ -static void -__clear_page_buffers(struct page *page) -{ - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); -} static void free_buffers(struct page *page) { struct buffer_head *bh; @@ -345,7 +337,7 @@ static void free_buffers(struct page *page) free_buffer_head(bh); bh = next; } - __clear_page_buffers(page); + detach_page_private(page); put_page(page); } @@ -374,7 +366,7 @@ static int read_page(struct file *file, unsigned long index, ret = -ENOMEM; goto out; } - attach_page_buffers(page, bh); + attach_page_private(page, bh); blk_cur = index << (PAGE_SHIFT - inode->i_blkbits); while (bh) { block = blk_cur; -- cgit v1.2.3 From d1b89bc0426110138ddc80a6916f359d0174571d Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:47:45 -0700 Subject: btrfs: use attach/detach_page_private Since the new pair function is introduced, we can call them to clean the code in btrfs. 
Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Acked-by: David Sterba Cc: Chris Mason Cc: Josef Bacik Link: http://lkml.kernel.org/r/20200517214718.468-4-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- fs/btrfs/disk-io.c | 4 +--- fs/btrfs/extent_io.c | 21 ++++++--------------- fs/btrfs/inode.c | 23 +++++------------------ 3 files changed, 12 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d10c7be10f3b..7278789ff8a7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -980,9 +980,7 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, "page private not zero on page %llu", (unsigned long long)page_offset(page)); - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); + detach_page_private(page); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fc46adf2f5bf..e12eb32d9e17 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3076,22 +3076,16 @@ static int submit_extent_page(unsigned int opf, static void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long)eb); - } else { + if (!PagePrivate(page)) + attach_page_private(page, eb); + else WARN_ON(page->private != (unsigned long)eb); - } } void set_page_extent_mapped(struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); - } + if (!PagePrivate(page)) + attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); } static struct extent_map * @@ -4910,10 +4904,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) * We need to make sure we haven't be attached * to a new eb. 
*/ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - put_page(page); + detach_page_private(page); } if (mapped) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ba0aa8b4ad09..8b3489f229c7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8301,11 +8301,8 @@ static void btrfs_readahead(struct readahead_control *rac) static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + if (ret == 1) + detach_page_private(page); return ret; } @@ -8327,14 +8324,8 @@ static int btrfs_migratepage(struct address_space *mapping, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (PagePrivate2(page)) { ClearPagePrivate2(page); @@ -8456,11 +8447,7 @@ again: } ClearPageChecked(page); - if (PagePrivate(page)) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + detach_page_private(page); } /* -- cgit v1.2.3 From 45dcfc27329f6b0e78ea7023ea7fb8c856d70129 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:47:48 -0700 Subject: fs/buffer.c: use attach/detach_page_private Since the new pair function is introduced, we can call them to clean the code in buffer.c. 
Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Alexander Viro Link: http://lkml.kernel.org/r/20200517214718.468-5-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- fs/buffer.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 15f25170615a..64fe82ec65ff 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -123,14 +123,6 @@ void __wait_on_buffer(struct buffer_head * bh) } EXPORT_SYMBOL(__wait_on_buffer); -static void -__clear_page_buffers(struct page *page) -{ - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); -} - static void buffer_io_error(struct buffer_head *bh, char *msg) { if (!test_bit(BH_Quiet, &bh->b_state)) @@ -906,7 +898,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) @@ -1587,7 +1579,7 @@ void create_empty_buffers(struct page *page, bh = bh->b_this_page; } while (bh != head); } - attach_page_buffers(page, head); + attach_page_private(page, head); spin_unlock(&page->mapping->private_lock); } EXPORT_SYMBOL(create_empty_buffers); @@ -2574,7 +2566,7 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) bh->b_this_page = head; bh = bh->b_this_page; } while (bh != head); - attach_page_buffers(page, head); + attach_page_private(page, head); spin_unlock(&page->mapping->private_lock); } @@ -3234,7 +3226,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = next; } while (bh != head); *buffers_to_free = head; - __clear_page_buffers(page); + detach_page_private(page); return 1; failed: return 0; -- cgit v1.2.3 From 7128cf9a25096a75139ec5dcd710ede182657210 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:47:51 -0700 Subject: f2fs: 
use attach/detach_page_private Since the new pair function is introduced, we can call them to clean the code in f2fs.h. Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Acked-by: Chao Yu Cc: Jaegeuk Kim Link: http://lkml.kernel.org/r/20200517214718.468-6-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- fs/f2fs/f2fs.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b5a5da74c013..5c0149d2f46a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3051,19 +3051,12 @@ static inline void f2fs_set_page_private(struct page *page, if (PagePrivate(page)) return; - get_page(page); - SetPagePrivate(page); - set_page_private(page, data); + attach_page_private(page, (void *)data); } static inline void f2fs_clear_page_private(struct page *page) { - if (!PagePrivate(page)) - return; - - set_page_private(page, 0); - ClearPagePrivate(page); - f2fs_put_page(page, 0); + detach_page_private(page); } /* -- cgit v1.2.3 From 58aeb731963cb4cf4db35412723b5084799dbe06 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:47:54 -0700 Subject: iomap: use attach/detach_page_private Since the new pair function is introduced, we can call them to clean the code in iomap. Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: Darrick J. Wong Cc: Christoph Hellwig Cc: Dave Chinner Link: http://lkml.kernel.org/r/20200517214718.468-7-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- fs/iomap/buffered-io.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 890c8fcda4f3..a1ed7620fbac 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -59,24 +59,19 @@ iomap_page_create(struct inode *inode, struct page *page) * migrate_page_move_mapping() assumes that pages with private data have * their count elevated by 1. 
*/ - get_page(page); - set_page_private(page, (unsigned long)iop); - SetPagePrivate(page); + attach_page_private(page, iop); return iop; } static void iomap_page_release(struct page *page) { - struct iomap_page *iop = to_iomap_page(page); + struct iomap_page *iop = detach_page_private(page); if (!iop) return; WARN_ON_ONCE(atomic_read(&iop->read_count)); WARN_ON_ONCE(atomic_read(&iop->write_count)); - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); kfree(iop); } @@ -526,14 +521,8 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); -- cgit v1.2.3 From 14ed109e3f3daabb5689b32176a3b2194ae1b609 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:47:57 -0700 Subject: ntfs: replace attach_page_buffers with attach_page_private Call the new function since attach_page_buffers will be removed. 
Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Anton Altaparmakov Link: http://lkml.kernel.org/r/20200517214718.468-8-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- fs/ntfs/aops.c | 2 +- fs/ntfs/mft.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 554b744f41bf..bb0a43860ad2 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1732,7 +1732,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } else buffers_to_free = bh; } diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 3aac5c917afe..fbb9f1bc623d 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -504,7 +504,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } bh = head = page_buffers(page); BUG_ON(!bh); -- cgit v1.2.3 From 4c42be38c28837017248beb46012b6081f45cb38 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:48:00 -0700 Subject: orangefs: use attach/detach_page_private Since the new pair function is introduced, we can call them to clean the code in orangefs. 
Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Tested-by: Mike Marshall Reviewed-by: Andrew Morton Cc: Martin Brandenburg Link: http://lkml.kernel.org/r/20200517214718.468-9-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- fs/orangefs/inode.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 12ae630fbed7..48f0547d4850 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -62,12 +62,7 @@ static int orangefs_writepage_locked(struct page *page, } else { ret = 0; } - if (wr) { - kfree(wr); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } + kfree(detach_page_private(page)); return ret; } @@ -409,9 +404,7 @@ static int orangefs_write_begin(struct file *file, wr->len = len; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - SetPagePrivate(page); - set_page_private(page, (unsigned long)wr); - get_page(page); + attach_page_private(page, wr); okay: return 0; } @@ -459,18 +452,12 @@ static void orangefs_invalidatepage(struct page *page, wr = (struct orangefs_write_range *)page_private(page); if (offset == 0 && length == PAGE_SIZE) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); + kfree(detach_page_private(page)); return; /* write range entirely within invalidate range (or equal) */ } else if (page_offset(page) + offset <= wr->pos && wr->pos + wr->len <= page_offset(page) + offset + length) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); + kfree(detach_page_private(page)); /* XXX is this right? 
only caller in fs */ cancel_dirty_page(page); return; @@ -535,12 +522,7 @@ static int orangefs_releasepage(struct page *page, gfp_t foo) static void orangefs_freepage(struct page *page) { - if (PagePrivate(page)) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } + kfree(detach_page_private(page)); } static int orangefs_launder_page(struct page *page) @@ -740,9 +722,7 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) wr->len = PAGE_SIZE; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - SetPagePrivate(page); - set_page_private(page, (unsigned long)wr); - get_page(page); + attach_page_private(page, wr); okay: file_update_time(vmf->vma->vm_file); -- cgit v1.2.3 From 7b59435a2afed12dc9b2ec1b930efa2e94f1c397 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:48:03 -0700 Subject: buffer_head.h: remove attach_page_buffers All the callers have replaced attach_page_buffers with the new function attach_page_private, so remove it. 
Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Roman Gushchin Cc: Andreas Dilger Link: http://lkml.kernel.org/r/20200517214718.468-10-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- include/linux/buffer_head.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 15b765a181b8..22fb11e2d2e0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -272,14 +272,6 @@ void buffer_init(void); * inline definitions */ -static inline void attach_page_buffers(struct page *page, - struct buffer_head *head) -{ - get_page(page); - SetPagePrivate(page); - set_page_private(page, (unsigned long)head); -} - static inline void get_bh(struct buffer_head *bh) { atomic_inc(&bh->b_count); -- cgit v1.2.3 From cd0f37154443844256709f736754b1bace5b24d8 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:48:06 -0700 Subject: mm/migrate.c: call detach_page_private to cleanup code We can cleanup code a little by call detach_page_private here. 
[akpm@linux-foundation.org: use attach_page_private(), per Dave] http://lkml.kernel.org/r/20200521225220.GV2005@dread.disaster.area [akpm@linux-foundation.org: clear PagePrivate] Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Chao Yu Cc: Cong Wang Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: John Hubbard Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200519214049.15179-1-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- mm/migrate.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index f66f93f9a5e2..fb425d86c115 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -797,10 +797,7 @@ recheck_buffers: if (rc != MIGRATEPAGE_SUCCESS) goto unlock_buffers; - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); + attach_page_private(newpage, detach_page_private(page)); get_page(newpage); bh = head; @@ -810,8 +807,6 @@ recheck_buffers: } while (bh != head); - SetPagePrivate(newpage); - if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); else -- cgit v1.2.3 From 60e65a6f42d0af33fa7361bd8723adc70539121b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 1 Jun 2020 21:48:09 -0700 Subject: mm_types.h: change set_page_private to inline function Change it to inline function to make callers use the proper argument. And no need for it to be macro per Andrew's comment [1]. 
[1] https://lore.kernel.org/lkml/20200518221235.1fa32c38e5766113f78e3f0d@linux-foundation.org/ Signed-off-by: Guoqing Jiang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/20200525203149.18802-1-guoqing.jiang@cloud.ionos.com Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4aba6c0c2ba8..ef6d3aface8a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -240,7 +240,11 @@ static inline atomic_t *compound_pincount_ptr(struct page *page) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) #define page_private(page) ((page)->private) -#define set_page_private(page, v) ((page)->private = (v)) + +static inline void set_page_private(struct page *page, unsigned long private) +{ + page->private = private; +} struct page_frag_cache { void * va; -- cgit v1.2.3 From 74f4c89d81e57613848043fe0d38caf9547b0324 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:48:12 -0700 Subject: mm/filemap.c: remove misleading comment We no longer return 0 here and the comment doesn't tell us anything that we don't already know (SIGBUS is a pretty good indicator that things didn't work out). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: William Kucharski Link: http://lkml.kernel.org/r/20200529123243.20640-1-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 23a051a7ef0f..fe079e9219d1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2566,7 +2566,6 @@ page_not_uptodate: if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; - /* Things didn't work out. Return zero to tell the mm layer so. 
*/ shrink_readahead_size_eio(ra); return VM_FAULT_SIGBUS; -- cgit v1.2.3 From 28659cc8cc8766707b1b50906df38bd94dcf349b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 1 Jun 2020 21:48:15 -0700 Subject: mm/page-writeback.c: remove unused variable Commit 64081362e8ff ("mm/page-writeback.c: fix range_cyclic writeback vs writepages deadlock") left unused variable, remove it. Signed-off-by: Chao Yu Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Link: http://lkml.kernel.org/r/20200528033740.17269-1-yuchao0@huawei.com Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7326b54ab728..2df6fb174983 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2164,7 +2164,6 @@ int write_cache_pages(struct address_space *mapping, int error; struct pagevec pvec; int nr_pages; - pgoff_t uninitialized_var(writeback_index); pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; @@ -2173,8 +2172,7 @@ int write_cache_pages(struct address_space *mapping, pagevec_init(&pvec); if (wbc->range_cyclic) { - writeback_index = mapping->writeback_index; /* prev offset */ - index = writeback_index; + index = mapping->writeback_index; /* prev offset */ end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; -- cgit v1.2.3 From a37b0715ddf3007734c4e2424c14bc7efcdd1190 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 Jun 2020 21:48:18 -0700 Subject: mm/writeback: replace PF_LESS_THROTTLE with PF_LOCAL_THROTTLE PF_LESS_THROTTLE exists for loop-back nfsd (and a similar need in the loop block driver and callers of prctl(PR_SET_IO_FLUSHER)), where a daemon needs to write to one bdi (the final bdi) in order to free up writes queued to another bdi (the client bdi). The daemon sets PF_LESS_THROTTLE and gets a larger allowance of dirty pages, so that it can still dirty pages after other processses have been throttled. 
The purpose of this is to avoid the deadlock that happens when the PF_LESS_THROTTLE process must write for any dirty pages to be freed, but it is being throttled and cannot write. This approach was designed when all threads were blocked equally, independently of which device they were writing to, or how fast it was. Since that time the writeback algorithm has changed substantially with different threads getting different allowances based on non-trivial heuristics. This means the simple "add 25%" heuristic is no longer reliable. The important issue is not that the daemon needs a *larger* dirty page allowance, but that it needs a *private* dirty page allowance, so that dirty pages for the "client" bdi that it is helping to clear (the bdi for an NFS filesystem or loop block device etc) do not affect the throttling of the daemon writing to the "final" bdi. This patch changes the heuristic so that the task is not throttled when the bdi it is writing to has a dirty page count below (or equal to) the free-run threshold for that bdi. This ensures it will always be able to have some pages in flight, and so will not deadlock. In a steady-state, it is expected that PF_LOCAL_THROTTLE tasks might still be throttled by the global threshold, but that is acceptable as it is only the deadlock state that is interesting for this flag. This approach of "only throttle when target bdi is busy" is consistent with the other use of PF_LESS_THROTTLE in current_may_throttle(), where it causes attention to be focussed only on the target bdi. So this patch - renames PF_LESS_THROTTLE to PF_LOCAL_THROTTLE, - removes the 25% bonus that the flag gives, and - If PF_LOCAL_THROTTLE is set, don't delay at all unless the global and the local free-run thresholds are exceeded. Note that previously realtime threads were treated the same as PF_LESS_THROTTLE threads. This patch does *not* change the behaviour for real-time threads, so it is now different from the behaviour of nfsd and loop tasks.
I don't know what is wanted for realtime. [akpm@linux-foundation.org: coding style fixes] Signed-off-by: NeilBrown Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Acked-by: Chuck Lever [nfsd] Cc: Christoph Hellwig Cc: Michal Hocko Cc: Trond Myklebust Link: http://lkml.kernel.org/r/87ftbf7gs3.fsf@notabene.neil.brown.name Signed-off-by: Linus Torvalds --- drivers/block/loop.c | 2 +- fs/nfsd/vfs.c | 9 +++++---- include/linux/sched.h | 3 ++- kernel/sys.c | 2 +- mm/page-writeback.c | 41 +++++++++++++++++++++++++++++++++-------- mm/vmscan.c | 4 ++-- 6 files changed, 44 insertions(+), 17 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index da693e6a834e..d89c25ba3b89 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -919,7 +919,7 @@ static void loop_unprepare_queue(struct loop_device *lo) static int loop_kthread_worker_fn(void *worker_ptr) { - current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO; + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; return kthread_worker_fn(worker_ptr); } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0aa02eb18bd3..c3fbab1753ec 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -979,12 +979,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* - * We want less throttling in balance_dirty_pages() - * and shrink_inactive_list() so that nfs to + * We want throttling in balance_dirty_pages() + * and shrink_inactive_list() to only consider + * the backingdev we are writing to, so that nfs to * localhost doesn't cause nfsd to lock up due to all * the client's dirty pages or its congested queue. 
*/ - current->flags |= PF_LESS_THROTTLE; + current->flags |= PF_LOCAL_THROTTLE; exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); @@ -1037,7 +1038,7 @@ out_nfserr: nfserr = nfserrno(host_err); } if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - current_restore_flags(pflags, PF_LESS_THROTTLE); + current_restore_flags(pflags, PF_LOCAL_THROTTLE); return nfserr; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 4418f5cb8324..12ef0c753284 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1481,7 +1481,8 @@ extern struct pid *cad_pid; #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ -#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ +#define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, + * I am cleaning dirty pages from some other bdi. 
*/ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ diff --git a/kernel/sys.c b/kernel/sys.c index d325f3ab624a..180a2fa33f7f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2262,7 +2262,7 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, return -EINVAL; } -#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LESS_THROTTLE) +#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2df6fb174983..7ff2290cf43d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -387,8 +387,7 @@ static unsigned long global_dirtyable_memory(void) * Calculate @dtc->thresh and ->bg_thresh considering * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller * must ensure that @dtc->avail is set before calling this function. The - * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. + * dirty limits will be lifted by 1/4 for real-time tasks. 
*/ static void domain_dirty_limits(struct dirty_throttle_control *dtc) { @@ -436,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) if (bg_thresh >= thresh) bg_thresh = thresh / 2; tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + if (rt_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } @@ -486,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) else dirty = vm_dirty_ratio * node_memory / 100; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + if (rt_task(tsk)) dirty += dirty / 4; return dirty; @@ -1653,8 +1652,12 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && (!mdtc || m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { - unsigned long intv = dirty_poll_interval(dirty, thresh); - unsigned long m_intv = ULONG_MAX; + unsigned long intv; + unsigned long m_intv; + +free_running: + intv = dirty_poll_interval(dirty, thresh); + m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; @@ -1673,9 +1676,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * Calculate global domain's pos_ratio and select the * global dtc by default. */ - if (!strictlimit) + if (!strictlimit) { wb_dirty_limits(gdtc); + if ((current->flags & PF_LOCAL_THROTTLE) && + gdtc->wb_dirty < + dirty_freerun_ceiling(gdtc->wb_thresh, + gdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be throttled + * when below the per-wb freerun ceiling. + */ + goto free_running; + } + dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit); @@ -1689,9 +1703,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb, * both global and memcg domains. Choose the one * w/ lower pos_ratio. 
*/ - if (!strictlimit) + if (!strictlimit) { wb_dirty_limits(mdtc); + if ((current->flags & PF_LOCAL_THROTTLE) && + mdtc->wb_dirty < + dirty_freerun_ceiling(mdtc->wb_thresh, + mdtc->wb_bg_thresh)) + /* + * LOCAL_THROTTLE tasks must not be + * throttled when below the per-wb + * freerun ceiling. + */ + goto free_running; + } dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && ((mdtc->dirty > mdtc->thresh) || strictlimit); diff --git a/mm/vmscan.c b/mm/vmscan.c index a37c87b5aee2..b2f5deb3603c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1878,13 +1878,13 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, /* * If a kernel thread (such as nfsd for loop-back mounts) services - * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE. * In that case we should only throttle if the backing device it is * writing to is congested. In other cases it is safe to throttle. */ static int current_may_throttle(void) { - return !(current->flags & PF_LESS_THROTTLE) || + return !(current->flags & PF_LOCAL_THROTTLE) || current->backing_dev_info == NULL || bdi_write_congested(current->backing_dev_info); } -- cgit v1.2.3 From 8d92890bd6b8502d6aee4b37430ae6444ade7a8c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 Jun 2020 21:48:21 -0700 Subject: mm/writeback: discard NR_UNSTABLE_NFS, use NR_WRITEBACK instead After an NFS page has been written it is considered "unstable" until a COMMIT request succeeds. If the COMMIT fails, the page will be re-written. These "unstable" pages are currently accounted as "reclaimable", either in WB_RECLAIMABLE, or in NR_UNSTABLE_NFS which is included in a 'reclaimable' count. This might have made sense when sending the COMMIT required a separate action by the VFS/MM (e.g. releasepage() used to send a COMMIT). 
However, now that all writes generated by ->writepages() will automatically be followed by a COMMIT (since commit 919e3bd9a875 ("NFS: Ensure we commit after writeback is complete")) it makes more sense to treat them as writeback pages. So this patch removes NR_UNSTABLE_NFS and accounts unstable pages in NR_WRITEBACK and WB_WRITEBACK. A particular effect of this change is that when wb_check_background_flush() calls wb_over_bg_thresh(), the latter will report 'true' a lot less often as the 'unstable' pages are no longer considered 'dirty' (as there is nothing that writeback can do about them anyway). Currently wb_check_background_flush() will trigger writeback to NFS even when there are relatively few dirty pages (if there are lots of unstable pages); this can result in small writes going to the server (10s of Kilobytes rather than a Megabyte) which hurts throughput. With this patch, there are fewer writes which are each larger on average. Where the NR_UNSTABLE_NFS count was included in statistics virtual-files, the entry is retained, but the value is hard-coded as zero. Static trace points and warning printks which mentioned this counter no longer report it.
[akpm@linux-foundation.org: re-layout comment] [akpm@linux-foundation.org: fix printk warning] Signed-off-by: NeilBrown Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Trond Myklebust Acked-by: Michal Hocko [mm] Cc: Christoph Hellwig Cc: Chuck Lever Link: http://lkml.kernel.org/r/87d06j7gqa.fsf@notabene.neil.brown.name Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.rst | 4 ++-- drivers/base/node.c | 2 +- fs/fs-writeback.c | 1 - fs/nfs/internal.h | 10 +++++++--- fs/nfs/write.c | 4 ++-- fs/proc/meminfo.c | 3 +-- include/linux/mmzone.h | 1 - include/trace/events/writeback.h | 5 +---- mm/memcontrol.c | 1 - mm/page-writeback.c | 17 ++++------------- mm/page_alloc.c | 5 +---- mm/vmstat.c | 11 +++++++++-- 12 files changed, 28 insertions(+), 36 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 38b606991065..092b7b44d158 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1042,8 +1042,8 @@ PageTables amount of memory dedicated to the lowest level of page tables. NFS_Unstable - NFS pages sent to the server, but not yet committed to stable - storage + Always zero. Previous counted pages which had been written to + the server, but has not been committed to stable storage. 
Bounce Memory used for block device "bounce buffers" WritebackTmp diff --git a/drivers/base/node.c b/drivers/base/node.c index 10d7e818e118..6012574913f7 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -439,7 +439,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(i.sharedram), nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB), nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), - nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), + nid, 0UL, nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), nid, K(sreclaimable + diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76ac9c7d32ec..c5bdf46e3b4b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1070,7 +1070,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, static unsigned long get_nr_dirty_pages(void) { return global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS) + get_nr_dirty_inodes(); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1f32a9fbfdaf..6673a77884d9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -668,7 +668,8 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) } /* - * Record the page as unstable and mark its inode as dirty. + * Record the page as unstable (an extra writeback period) and mark its + * inode as dirty. */ static inline void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) @@ -676,8 +677,11 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) if (!cinfo->dreq) { struct inode *inode = page_file_mapping(page)->host; - inc_node_page_state(page, NR_UNSTABLE_NFS); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); + /* This page is really still in write-back - just that the + * writeback is happening on the server now. 
+ */ + inc_node_page_state(page, NR_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1e767f779c49..639c34fec04a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -946,9 +946,9 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static void nfs_clear_page_commit(struct page *page) { - dec_node_page_state(page, NR_UNSTABLE_NFS); + dec_node_page_state(page, NR_WRITEBACK); dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, - WB_RECLAIMABLE); + WB_WRITEBACK); } /* Called holding the request lock on @req */ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8c1f1bb1a5ce..9bd94b5a9658 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -106,8 +106,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); - show_val_kb(m, "NFS_Unstable: ", - global_node_page_state(NR_UNSTABLE_NFS)); + show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1b9de7d220fb..a89f47515eb1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -193,7 +193,6 @@ enum node_stat_item { NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, - NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 85a33bea76f1..10f5d1fa7347 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -541,7 +541,6 @@ TRACE_EVENT(global_dirty_state, TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) - __field(unsigned long, nr_unstable) __field(unsigned long, 
background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) @@ -552,7 +551,6 @@ TRACE_EVENT(global_dirty_state, TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); - __entry->nr_unstable = global_node_page_state(NR_UNSTABLE_NFS); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; @@ -560,12 +558,11 @@ TRACE_EVENT(global_dirty_state, __entry->dirty_limit = global_wb_domain.dirty_limit; ), - TP_printk("dirty=%lu writeback=%lu unstable=%lu " + TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, - __entry->nr_unstable, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a3b97f103966..1db4b285c407 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4330,7 +4330,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); - /* this should eventually include NR_UNSTABLE_NFS */ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + memcg_exact_page_state(memcg, NR_ACTIVE_FILE); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7ff2290cf43d..718565266257 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -504,7 +504,6 @@ bool node_dirty_ok(struct pglist_data *pgdat) unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); - nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; @@ -758,7 +757,7 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * bounded by the bdi->min_ratio and/or bdi->max_ratio 
parameters, if set. * * Return: @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * dirty balancing includes all PG_dirty and PG_writeback pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { @@ -1566,7 +1565,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ + unsigned long nr_reclaimable; /* = file_dirty */ long period; long pause; long max_pause; @@ -1586,14 +1585,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; - /* - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - */ - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); gdtc->avail = global_dirtyable_memory(); gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); @@ -1963,8 +1955,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * as we're trying to decide whether to put more under writeback. 
*/ gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); domain_dirty_limits(gdtc); if (gdtc->dirty > gdtc->bg_thresh) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 13cc653122b7..cc406ee17ad9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5319,7 +5319,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", @@ -5332,7 +5332,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_UNEVICTABLE), global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), - global_node_page_state(NR_UNSTABLE_NFS), global_node_page_state(NR_SLAB_RECLAIMABLE), global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), @@ -5365,7 +5364,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " anon_thp: %lukB" #endif " writeback_tmp:%lukB" - " unstable:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5387,7 +5385,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - K(node_page_state(pgdat, NR_UNSTABLE_NFS)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? 
"yes" : "no"); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..b1582fdf757c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1108,7 +1108,7 @@ int fragmentation_index(struct zone *zone, unsigned int order) TEXT_FOR_HIGHMEM(xx) xx "_movable", const char * const vmstat_text[] = { - /* enum zone_stat_item countes */ + /* enum zone_stat_item counters */ "nr_free_pages", "nr_zone_inactive_anon", "nr_zone_active_anon", @@ -1162,7 +1162,6 @@ const char * const vmstat_text[] = { "nr_file_hugepages", "nr_file_pmdmapped", "nr_anon_transparent_hugepages", - "nr_unstable", "nr_vmscan_write", "nr_vmscan_immediate_reclaim", "nr_dirtied", @@ -1723,6 +1722,14 @@ static int vmstat_show(struct seq_file *m, void *arg) seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); + + if (off == NR_VMSTAT_ITEMS - 1) { + /* + * We've come to the end - add any deprecated counters to avoid + * breaking userspace which might depend on them being present. + */ + seq_puts(m, "nr_unstable 0\n"); + } return 0; } -- cgit v1.2.3 From adc8cb406e52f89071f488848b0fb3e7907ae332 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 1 Jun 2020 21:48:24 -0700 Subject: mm/gup.c: update the documentation This patch is an attempt to update the documentation. - Add/ remove extra * based on type of function static/global. - Add description for functions and their input arguments. 
[akpm@linux-foundation.org: s@/*@/**@] Signed-off-by: Souptick Joarder Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/1588013630-4497-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Linus Torvalds --- mm/gup.c | 57 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 87a6a59fe667..ca723a33268e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1168,7 +1168,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma, return true; } -/* +/** * fixup_user_fault() - manually resolve a user page fault * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1839,7 +1839,7 @@ static long __get_user_pages_remote(struct task_struct *tsk, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } -/* +/** * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -1870,13 +1870,13 @@ static long __get_user_pages_remote(struct task_struct *tsk, * * Must be called with mmap_sem held for read or write. * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given + * get_user_pages_remote walks a process's page tables and takes a reference + * to each struct page that each user address corresponds to at a given * instant. That is, it takes the page that would be accessed if a user * thread accesses the given user virtual address at that instant. * * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different + * get_user_pages_remote returns, and there may even be a completely different * page there in some cases (eg. if mmapped pagecache has been invalidated * and subsequently re faulted). 
However it does guarantee that the page * won't be freed completely. And mostly callers simply care that the page @@ -1888,17 +1888,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must * be called after the page is finished with, and before put_page is called. * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. + * get_user_pages_remote is typically used for fewer-copy IO operations, + * to get a handle on the memory by some means other than accesses + * via the user virtual addresses. The pages may be submitted for + * DMA to devices or accessed via their kernel linear mapping (via the + * kmap APIs). Care should be taken to use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. * - * get_user_pages should be phased out in favor of + * get_user_pages_remote should be phased out in favor of * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing - * should use get_user_pages because it cannot pass + * should use get_user_pages_remote because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, @@ -1937,7 +1937,17 @@ static long __get_user_pages_remote(struct task_struct *tsk, } #endif /* !CONFIG_MMU */ -/* +/** + * get_user_pages() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. 
Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task * and mm being operated on are the current task's and don't allow @@ -1960,11 +1970,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages); -/* - * We can leverage the VM_FAULT_RETRY functionality in the page fault - * paths better by using either get_user_pages_locked() or - * get_user_pages_unlocked(). - * +/** * get_user_pages_locked() is suitable to replace the form: * * down_read(&mm->mmap_sem); @@ -1980,6 +1986,21 @@ EXPORT_SYMBOL(get_user_pages); * get_user_pages_locked(tsk, mm, ..., pages, &locked); * if (locked) * up_read(&mm->mmap_sem); + * + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). 
+ * */ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, -- cgit v1.2.3 From 91429023342789a89f4b6ae95b47a7df71ab6d95 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 1 Jun 2020 21:48:27 -0700 Subject: mm/gup: introduce pin_user_pages_unlocked Introduce pin_user_pages_unlocked(), which is nearly identical to the get_user_pages_unlocked() that it wraps, except that it sets FOLL_PIN and rejects FOLL_GET. Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Andy Walls Cc: Mauro Carvalho Chehab Link: http://lkml.kernel.org/r/20200518012157.1178336-2-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/gup.c | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 92704fde6475..ebbb0acbeee2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1713,6 +1713,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); diff --git a/mm/gup.c b/mm/gup.c index ca723a33268e..c3014669988b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2960,3 +2960,20 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, pages, vmas, gup_flags); } EXPORT_SYMBOL(pin_user_pages); + +/* + * pin_user_pages_unlocked() is the FOLL_PIN variant of + * get_user_pages_unlocked(). Behavior is the same, except that this one sets + * FOLL_PIN and rejects FOLL_GET. 
+ */ +long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, + struct page **pages, unsigned int gup_flags) +{ + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); +} +EXPORT_SYMBOL(pin_user_pages_unlocked); -- cgit v1.2.3 From e792031019bdc3f1eb5cac5a4171f7ffb8586c1b Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 1 Jun 2020 21:48:30 -0700 Subject: ivtv: convert get_user_pages() --> pin_user_pages() This code was using get_user_pages*(), in a "Case 2" scenario (DMA/RDMA), using the categorization from [1]. That means that it's time to convert the get_user_pages*() + put_page() calls to pin_user_pages*() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. [1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Cc: Andy Walls Cc: Mauro Carvalho Chehab Link: http://lkml.kernel.org/r/20200518012157.1178336-3-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- drivers/media/pci/ivtv/ivtv-udma.c | 19 ++++++------------- drivers/media/pci/ivtv/ivtv-yuv.c | 17 ++++++----------- drivers/media/pci/ivtv/ivtvfb.c | 4 ++-- 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c index 5f8883031c9c..0d8372cc364a 100644 --- a/drivers/media/pci/ivtv/ivtv-udma.c +++ b/drivers/media/pci/ivtv/ivtv-udma.c @@ -92,7 +92,7 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, { struct ivtv_dma_page_info user_dma; struct ivtv_user_dma *dma = &itv->udma; - int i, err; + int err; IVTV_DEBUG_DMA("ivtv_udma_setup, dst: 0x%08x\n", (unsigned 
int)ivtv_dest_addr); @@ -111,16 +111,15 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, return -EINVAL; } - /* Get user pages for DMA Xfer */ - err = get_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, + /* Pin user pages for DMA Xfer */ + err = pin_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, dma->map, FOLL_FORCE); if (user_dma.page_count != err) { IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", err, user_dma.page_count); if (err >= 0) { - for (i = 0; i < err; i++) - put_page(dma->map[i]); + unpin_user_pages(dma->map, err); return -EINVAL; } return err; @@ -130,9 +129,7 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, /* Fill SG List with new values */ if (ivtv_udma_fill_sg_list(dma, &user_dma, 0) < 0) { - for (i = 0; i < dma->page_count; i++) { - put_page(dma->map[i]); - } + unpin_user_pages(dma->map, dma->page_count); dma->page_count = 0; return -ENOMEM; } @@ -153,7 +150,6 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, void ivtv_udma_unmap(struct ivtv *itv) { struct ivtv_user_dma *dma = &itv->udma; - int i; IVTV_DEBUG_INFO("ivtv_unmap_user_dma\n"); @@ -169,10 +165,7 @@ void ivtv_udma_unmap(struct ivtv *itv) /* sync DMA */ ivtv_udma_sync_for_cpu(itv); - /* Release User Pages */ - for (i = 0; i < dma->page_count; i++) { - put_page(dma->map[i]); - } + unpin_user_pages(dma->map, dma->page_count); dma->page_count = 0; } diff --git a/drivers/media/pci/ivtv/ivtv-yuv.c b/drivers/media/pci/ivtv/ivtv-yuv.c index cd2fe2d444c0..5f7dc9771f8d 100644 --- a/drivers/media/pci/ivtv/ivtv-yuv.c +++ b/drivers/media/pci/ivtv/ivtv-yuv.c @@ -30,7 +30,6 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, struct yuv_playback_info *yi = &itv->yuv_info; u8 frame = yi->draw_frame; struct yuv_frame_info *f = &yi->new_frame_info[frame]; - int i; int y_pages, uv_pages; unsigned long y_buffer_offset, uv_buffer_offset; int y_decode_height, 
uv_decode_height, y_size; @@ -62,12 +61,12 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, ivtv_udma_get_page_info (&y_dma, (unsigned long)args->y_source, 720 * y_decode_height); ivtv_udma_get_page_info (&uv_dma, (unsigned long)args->uv_source, 360 * uv_decode_height); - /* Get user pages for DMA Xfer */ - y_pages = get_user_pages_unlocked(y_dma.uaddr, + /* Pin user pages for DMA Xfer */ + y_pages = pin_user_pages_unlocked(y_dma.uaddr, y_dma.page_count, &dma->map[0], FOLL_FORCE); uv_pages = 0; /* silence gcc. value is set and consumed only if: */ if (y_pages == y_dma.page_count) { - uv_pages = get_user_pages_unlocked(uv_dma.uaddr, + uv_pages = pin_user_pages_unlocked(uv_dma.uaddr, uv_dma.page_count, &dma->map[y_pages], FOLL_FORCE); } @@ -81,8 +80,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, uv_pages, uv_dma.page_count); if (uv_pages >= 0) { - for (i = 0; i < uv_pages; i++) - put_page(dma->map[y_pages + i]); + unpin_user_pages(&dma->map[y_pages], uv_pages); rc = -EFAULT; } else { rc = uv_pages; @@ -93,8 +91,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, y_pages, y_dma.page_count); } if (y_pages >= 0) { - for (i = 0; i < y_pages; i++) - put_page(dma->map[i]); + unpin_user_pages(dma->map, y_pages); /* * Inherit the -EFAULT from rc's * initialization, but allow it to be @@ -112,9 +109,7 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, /* Fill & map SG List */ if (ivtv_udma_fill_sg_list (dma, &uv_dma, ivtv_udma_fill_sg_list (dma, &y_dma, 0)) < 0) { IVTV_DEBUG_WARN("could not allocate bounce buffers for highmem userspace buffers\n"); - for (i = 0; i < dma->page_count; i++) { - put_page(dma->map[i]); - } + unpin_user_pages(dma->map, dma->page_count); dma->page_count = 0; return -ENOMEM; } diff --git a/drivers/media/pci/ivtv/ivtvfb.c b/drivers/media/pci/ivtv/ivtvfb.c index 0c2859844081..e2d56dca5be4 100644 --- 
a/drivers/media/pci/ivtv/ivtvfb.c +++ b/drivers/media/pci/ivtv/ivtvfb.c @@ -281,10 +281,10 @@ static int ivtvfb_prep_dec_dma_to_device(struct ivtv *itv, /* Map User DMA */ if (ivtv_udma_setup(itv, ivtv_dest_addr, userbuf, size_in_bytes) <= 0) { mutex_unlock(&itv->udma.lock); - IVTVFB_WARN("ivtvfb_prep_dec_dma_to_device, Error with get_user_pages: %d bytes, %d pages returned\n", + IVTVFB_WARN("ivtvfb_prep_dec_dma_to_device, Error with pin_user_pages: %d bytes, %d pages returned\n", size_in_bytes, itv->udma.page_count); - /* get_user_pages must have failed completely */ + /* pin_user_pages must have failed completely */ return -EIO; } -- cgit v1.2.3 From 548b6a1e5520d5647e3d39c0aaf1f83ab27e250f Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Mon, 1 Jun 2020 21:48:33 -0700 Subject: mm/gup.c: further document vma_permits_fault() Describe the caller's responsibilities when passing FAULT_FLAG_ALLOW_RETRY. Link: http://lkml.kernel.org/r/1586915606.5647.5.camel@mtkswgap22 Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index c3014669988b..1d84291543fd 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1176,7 +1176,8 @@ static bool vma_permits_fault(struct vm_area_struct *vma, * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller - * does not allow retry + * does not allow retry. If NULL, the caller must guarantee + * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY. 
* * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() -- cgit v1.2.3 From 213516ac0752144ad1c82a0f8393f805c8abbc76 Mon Sep 17 00:00:00 2001 From: chenqiwu Date: Mon, 1 Jun 2020 21:48:36 -0700 Subject: mm/swapfile: use list_{prev,next}_entry() instead of open-coding Use list_{prev,next}_entry() instead of list_entry() for better code readability. Signed-off-by: chenqiwu Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Wei Yang Cc: Michal Hocko Cc: Pankaj Gupta Cc: Yang Shi Cc: Qian Cai Cc: Baoquan He Link: http://lkml.kernel.org/r/1586599916-15456-2-git-send-email-qiwuchen55@gmail.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 5871a2aa86a5..8d8dc674b87f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3654,7 +3654,7 @@ static bool swap_count_continued(struct swap_info_struct *si, spin_lock(&si->cont_lock); offset &= ~PAGE_MASK; - page = list_entry(head->lru.next, struct page, lru); + page = list_next_entry(head, lru); map = kmap_atomic(page) + offset; if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ @@ -3666,13 +3666,13 @@ static bool swap_count_continued(struct swap_info_struct *si, */ while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } if (*map == SWAP_CONT_MAX) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); if (page == head) { ret = false; /* add count continuation */ goto out; @@ -3682,12 +3682,10 @@ init_map: *map = 0; /* we didn't zero the page */ } *map += 1; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, 
lru); - while (page != head) { + while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = COUNT_CONTINUED; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); } ret = true; /* incremented */ @@ -3698,7 +3696,7 @@ init_map: *map = 0; /* we didn't zero the page */ BUG_ON(count != COUNT_CONTINUED); while (*map == COUNT_CONTINUED) { kunmap_atomic(map); - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); BUG_ON(page == head); map = kmap_atomic(page) + offset; } @@ -3707,13 +3705,11 @@ init_map: *map = 0; /* we didn't zero the page */ if (*map == 0) count = 0; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); - while (page != head) { + while ((page = list_prev_entry(page, lru)) != head) { map = kmap_atomic(page) + offset; *map = SWAP_CONT_MAX | count; count = COUNT_CONTINUED; kunmap_atomic(map); - page = list_entry(page->lru.prev, struct page, lru); } ret = count == COUNT_CONTINUED; } -- cgit v1.2.3 From d6c1f098f2a7ba62627c9bc17cda28f534ef9e4a Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 1 Jun 2020 21:48:40 -0700 Subject: mm/swap_state: fix a data race in swapin_nr_pages "prev_offset" is a static variable in swapin_nr_pages() that can be accessed concurrently with only mmap_sem held in read mode as noticed by KCSAN, BUG: KCSAN: data-race in swap_cluster_readahead / swap_cluster_readahead write to 0xffffffff92763830 of 8 bytes by task 14795 on cpu 17: swap_cluster_readahead+0x2a6/0x5e0 swapin_readahead+0x92/0x8dc do_swap_page+0x49b/0xf20 __handle_mm_fault+0xcfb/0xd70 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x715 page_fault+0x34/0x40 1 lock held by (dnf)/14795: #0: ffff897bd2e98858 (&mm->mmap_sem#2){++++}-{3:3}, at: do_page_fault+0x143/0x715 do_user_addr_fault at arch/x86/mm/fault.c:1405 (inlined by) do_page_fault at arch/x86/mm/fault.c:1535 irq event stamp: 83493 count_memcg_event_mm+0x1a6/0x270 count_memcg_event_mm+0x119/0x270 
__do_softirq+0x365/0x589 irq_exit+0xa2/0xc0 read to 0xffffffff92763830 of 8 bytes by task 1 on cpu 22: swap_cluster_readahead+0xfd/0x5e0 swapin_readahead+0x92/0x8dc do_swap_page+0x49b/0xf20 __handle_mm_fault+0xcfb/0xd70 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x715 page_fault+0x34/0x40 1 lock held by systemd/1: #0: ffff897c38f14858 (&mm->mmap_sem#2){++++}-{3:3}, at: do_page_fault+0x143/0x715 irq event stamp: 43530289 count_memcg_event_mm+0x1a6/0x270 count_memcg_event_mm+0x119/0x270 __do_softirq+0x365/0x589 irq_exit+0xa2/0xc0 Signed-off-by: Qian Cai Signed-off-by: Andrew Morton Cc: Marco Elver Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200402213748.2237-1-cai@lca.pw Signed-off-by: Linus Torvalds --- mm/swap_state.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index ebed37bbf7a3..8238954ae781 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -509,10 +509,11 @@ static unsigned long swapin_nr_pages(unsigned long offset) return 1; hits = atomic_xchg(&swapin_readahead_hits, 0); - pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages, + pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits, + max_pages, atomic_read(&last_readahead_pages)); if (!hits) - prev_offset = offset; + WRITE_ONCE(prev_offset, offset); atomic_set(&last_readahead_pages, pages); return pages; -- cgit v1.2.3 From ebc5951eea499314f6fbbde20e295f1345c67330 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 1 Jun 2020 21:48:43 -0700 Subject: mm: swap: properly update readahead statistics in unuse_pte_range() In unuse_pte_range() we blindly swap-in pages without checking if the swap entry is already present in the swap cache. By doing this, the hit/miss ratio used by the swap readahead heuristic is not properly updated and this leads to non-optimal performance during swapoff. 
Tracing the distribution of the readahead size returned by the swap readahead heuristic during swapoff shows that a small readahead size is used most of the time as if we had only misses (this happens both with cluster and vma readahead), for example: r::swapin_nr_pages(unsigned long offset):unsigned long:$retval COUNT EVENT 36948 $retval = 8 44151 $retval = 4 49290 $retval = 1 527771 $retval = 2 Checking if the swap entry is present in the swap cache, instead, allows the readahead statistics to be properly updated, and the heuristic behaves in a better way during swapoff, selecting a bigger readahead size: r::swapin_nr_pages(unsigned long offset):unsigned long:$retval COUNT EVENT 1618 $retval = 1 4960 $retval = 2 41315 $retval = 4 103521 $retval = 8 In terms of swapoff performance the result is the following: Testing environment =================== - Host: CPU: 1.8GHz Intel Core i7-8565U (quad-core, 8MB cache) HDD: PC401 NVMe SK hynix 512GB MEM: 16GB - Guest (kvm): 8GB of RAM virtio block driver 16GB swap file on ext4 (/swapfile) Test case ========= - allocate 85% of memory - `systemctl hibernate` to force all the pages to be swapped-out to the swap file - resume the system - measure the time that swapoff takes to complete: # /usr/bin/time swapoff /swapfile Result (swapoff time) ====== 5.6 vanilla 5.6 w/ this patch ----------- ----------------- cluster-readahead 22.09s 12.19s vma-readahead 18.20s 15.33s Conclusion ========== The specific use case this patch is addressing is to improve swapoff performance in cloud environments when a VM has been hibernated, resumed and all the memory needs to be forced back to RAM by disabling swap. This change makes better use of the readahead heuristic during swapoff, and this improvement speeds up the resume process of such VMs.
[andrea.righi@canonical.com: update changelog] Link: http://lkml.kernel.org/r/20200418084705.GA147642@xps-13 Signed-off-by: Andrea Righi Signed-off-by: Andrew Morton Reviewed-by: "Huang, Ying" Cc: Minchan Kim Cc: Anchal Agarwal Cc: Hugh Dickins Cc: Vineeth Remanan Pillai Cc: Kelley Nielsen Link: http://lkml.kernel.org/r/20200416180132.GB3352@xps-13 Signed-off-by: Linus Torvalds --- mm/swapfile.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 8d8dc674b87f..106ff9c1a6b9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1937,10 +1937,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte); swap_map = &si->swap_map[offset]; - vmf.vma = vma; - vmf.address = addr; - vmf.pmd = pmd; - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + page = lookup_swap_cache(entry, vma, addr); + if (!page) { + vmf.vma = vma; + vmf.address = addr; + vmf.pmd = pmd; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + &vmf); + } if (!page) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; -- cgit v1.2.3 From ca2c55a7371ca3db64f5ac9ed2cb998682bf6fae Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jun 2020 21:48:46 -0700 Subject: mm/swapfile.c: offset is only used when there is more slots scan_swap_map_slots() is used to iterate over the swap_map[] array looking for an available swap entry. After several optimizations, e.g. for the ssd case, the logic of this function has become a little hard to follow. This patchset tries to clean up the logic a little: * show that the ssd/non-ssd cases are handled mutually exclusively * remove some unnecessary gotos for the ssd case This patch (of 3): When si->cluster_nr is zero, the function reaches done and returns. The incremented offset is not used any more. This means we can move the offset increment into the if clause. This brings a further code cleanup possibility.
Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Tim Chen Link: http://lkml.kernel.org/r/20200328060520.31449-1-richard.weiyang@gmail.com Link: http://lkml.kernel.org/r/20200328060520.31449-2-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 106ff9c1a6b9..b6e68dcf7a18 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -874,11 +874,9 @@ checks: else goto done; } - /* non-ssd case */ - ++offset; /* non-ssd case, still more slots in cluster? */ - if (si->cluster_nr && !si->swap_map[offset]) { + if (si->cluster_nr && !si->swap_map[++offset]) { --si->cluster_nr; goto checks; } -- cgit v1.2.3 From f4eaf51a72e5c6d9dc895abdd02676a2ef4c3ae7 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jun 2020 21:48:49 -0700 Subject: mm/swapfile.c: explicitly show ssd/non-ssd is handled mutually exclusive The code shows that in the ssd case it jumps to a specific label and skips the following code for non-ssd. Let's use "else if" to explicitly show the mutual exclusion of the ssd/non-ssd cases and reduce ambiguity. Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Tim Chen Link: http://lkml.kernel.org/r/20200328060520.31449-3-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index b6e68dcf7a18..264d5837257a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -763,9 +763,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, goto checks; else goto scan; - } - - if (unlikely(!si->cluster_nr--)) { + } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; @@ -873,10 +871,8 @@ checks: goto checks; else goto done; - } - - /* non-ssd case, still more slots in cluster?
*/ - if (si->cluster_nr && !si->swap_map[++offset]) { + } else if (si->cluster_nr && !si->swap_map[++offset]) { + /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; goto checks; } -- cgit v1.2.3 From bd2d18da4a4f182c3627a74dc768bd437e64c582 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jun 2020 21:48:52 -0700 Subject: mm/swapfile.c: remove the unnecessary goto for SSD case Now we can see there are redundant gotos for the SSD case. In these two places, we can just let the code fall through to the correct label instead of explicitly jumping to it. Let's remove them for better readability. Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Tim Chen Link: http://lkml.kernel.org/r/20200328060520.31449-4-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 264d5837257a..f3af9901d8cd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -759,9 +759,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, /* SSD algorithm */ if (si->cluster_info) { - if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) - goto checks; - else + if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto scan; } else if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { @@ -869,8 +867,6 @@ checks: if (si->cluster_info) { if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base)) goto checks; - else - goto done; } else if (si->cluster_nr && !si->swap_map[++offset]) { /* non-ssd case, still more slots in cluster? */ --si->cluster_nr; -- cgit v1.2.3 From 08d3090fc8dadd7b726dbda99d1baa39382c3f2c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jun 2020 21:48:55 -0700 Subject: mm/swapfile.c: simplify the calculation of n_goal Use min3() to simplify the comparison and make it more self-explaining.
Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200325220309.9803-1-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f3af9901d8cd..f3b0a2c4972a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -994,11 +994,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) if (avail_pgs <= 0) goto noswap; - if (n_goal > SWAP_BATCH) - n_goal = SWAP_BATCH; - - if (n_goal > avail_pgs) - n_goal = avail_pgs; + n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); -- cgit v1.2.3 From abca1c84b7575b470789c8686688b11ba3bf138d Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jun 2020 21:48:58 -0700 Subject: mm/swapfile.c: remove the extra check in scan_swap_map_slots() scan_swap_map_slots() is only called by scan_swap_map() and get_swap_pages(). Both ensure nr would not exceed SWAP_BATCH. Just remove it. Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200325220309.9803-2-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f3b0a2c4972a..705b11a7a463 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -740,9 +740,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, int latency_ration = LATENCY_LIMIT; int n_ret = 0; - if (nr > SWAP_BATCH) - nr = SWAP_BATCH; - /* * We try to cluster swap pages by allocating them sequentially * in swap. 
Once we've allocated SWAPFILE_CLUSTER pages this -- cgit v1.2.3 From 0fd0e19e4d0e9c84117584783535a57ec3ae98c6 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jun 2020 21:49:01 -0700 Subject: mm/swapfile.c: found_free could be represented by (tmp < max) This is not necessary to use the variable found_free to record the status. Just check tmp and max is enough. Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: "Huang, Ying" Cc: Tim Chen Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200421213824.8099-1-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 705b11a7a463..263e3d86a114 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -601,7 +601,6 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, { struct percpu_cluster *cluster; struct swap_cluster_info *ci; - bool found_free; unsigned long tmp, max; new_cluster: @@ -623,8 +622,6 @@ new_cluster: return false; } - found_free = false; - /* * Other CPUs can use our cluster if they can't find a free cluster, * check if there is still free entry in the cluster @@ -638,21 +635,19 @@ new_cluster: } ci = lock_cluster(si, tmp); while (tmp < max) { - if (!si->swap_map[tmp]) { - found_free = true; + if (!si->swap_map[tmp]) break; - } tmp++; } unlock_cluster(ci); - if (!found_free) { + if (tmp >= max) { cluster_set_null(&cluster->index); goto new_cluster; } cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; - return found_free; + return tmp < max; } static void __del_from_avail_list(struct swap_info_struct *p) -- cgit v1.2.3 From fdff1debb9650ea43617487500f48168ce432cb7 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jun 2020 21:49:04 -0700 Subject: mm/swapfile.c: tmp is always smaller than max If tmp is bigger or equal to max, we would jump to new_cluster. Return true directly. 
Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: "Huang, Ying" Cc: Tim Chen Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200421213824.8099-2-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 263e3d86a114..c74c9e1dc50d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -647,7 +647,7 @@ new_cluster: cluster->next = tmp + 1; *offset = tmp; *scan_base = tmp; - return tmp < max; + return true; } static void __del_from_avail_list(struct swap_info_struct *p) -- cgit v1.2.3 From 7b9e2de130954af174e6f7aff278cf1ec5f39675 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jun 2020 21:49:07 -0700 Subject: mm/swapfile.c: omit a duplicate code by compare tmp and max first There are two duplicate pieces of code that handle the case when there is no available swap entry. To avoid this, we can compare tmp and max first and let the second guard do its job. No functional change is expected.
Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: "Huang, Ying" Cc: Tim Chen Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200421213824.8099-3-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index c74c9e1dc50d..1743386d2401 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -629,17 +629,15 @@ new_cluster: tmp = cluster->next; max = min_t(unsigned long, si->max, (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER); - if (tmp >= max) { - cluster_set_null(&cluster->index); - goto new_cluster; - } - ci = lock_cluster(si, tmp); - while (tmp < max) { - if (!si->swap_map[tmp]) - break; - tmp++; + if (tmp < max) { + ci = lock_cluster(si, tmp); + while (tmp < max) { + if (!si->swap_map[tmp]) + break; + tmp++; + } + unlock_cluster(ci); } - unlock_cluster(ci); if (tmp >= max) { cluster_set_null(&cluster->index); goto new_cluster; -- cgit v1.2.3 From ed43af10975eef7e21abbb81297d9735448ba4fa Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 1 Jun 2020 21:49:10 -0700 Subject: swap: try to scan more free slots even when fragmented Now, the scalability of swap code will drop much when the swap device becomes fragmented, because the swap slots allocation batching stops working. To solve the problem, in this patch, we will try to scan a little more swap slots with restricted effort to batch the swap slots allocation even if the swap device is fragmented. Test shows that the benchmark score can increase up to 37.1% with the patch. Details are as follows. The swap code has a per-cpu cache of swap slots. These batch swap space allocations to improve swap subsystem scaling. In the following code path, add_to_swap() get_swap_page() refill_swap_slots_cache() get_swap_pages() scan_swap_map_slots() scan_swap_map_slots() and get_swap_pages() can return multiple swap slots for each call. 
These slots will be cached in the per-CPU swap slots cache, so that several following swap slot requests will be fulfilled there to avoid the lock contention in the lower level swap space allocation/freeing code path. But this only works when there are free swap clusters. If a swap device becomes so fragmented that there are no free swap clusters, scan_swap_map_slots() and get_swap_pages() will return only one swap slot for each call in the above code path. Effectively, this falls back to the situation before the swap slots cache was introduced: the heavy lock contention on the swap related locks kills the scalability. Why does it work in this way? Because the swap device could be large, and the free swap slot scanning could be quite time consuming, to avoid taking too much time scanning free swap slots, the conservative method was used. In fact, this can be improved by scanning a few more free slots with strictly restricted effort, which is what this patch implements. In scan_swap_map_slots(), after the first free swap slot is found, we will try to scan a little more, but only if we haven't scanned too many slots (< LATENCY_LIMIT). That is, the added scanning latency is strictly restricted. To test the patch, we have run 16-process pmbench memory benchmark on a 2-socket server machine with 48 cores. Multiple ram disks are configured as the swap devices. The pmbench working-set size is much larger than the available memory so that swapping is triggered. The memory read/write ratio is 80/20 and the accessing pattern is random, so the swap space becomes highly fragmented during the test. In the original implementation, the lock contention on swap related locks is very heavy.
The perf profiling data of the lock contention code path is as following, _raw_spin_lock.get_swap_pages.get_swap_page.add_to_swap: 21.03 _raw_spin_lock_irq.shrink_inactive_list.shrink_lruvec.shrink_node: 1.92 _raw_spin_lock_irq.shrink_active_list.shrink_lruvec.shrink_node: 1.72 _raw_spin_lock.free_pcppages_bulk.drain_pages_zone.drain_pages: 0.69 While after applying this patch, it becomes, _raw_spin_lock_irq.shrink_inactive_list.shrink_lruvec.shrink_node: 4.89 _raw_spin_lock_irq.shrink_active_list.shrink_lruvec.shrink_node: 3.85 _raw_spin_lock.free_pcppages_bulk.drain_pages_zone.drain_pages: 1.1 _raw_spin_lock_irqsave.pagevec_lru_move_fn.__lru_cache_add.do_swap_page: 0.88 That is, the lock contention on the swap locks is eliminated. And the pmbench score increases 37.1%. The swapin throughput increases 45.7% from 2.02 GB/s to 2.94 GB/s. While the swapout throughput increases 45.3% from 2.04 GB/s to 2.97 GB/s. Signed-off-by: "Huang, Ying" Signed-off-by: Andrew Morton Acked-by: Tim Chen Cc: Dave Hansen Cc: Michal Hocko Cc: Minchan Kim Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200427030023.264780-1-ying.huang@intel.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index 1743386d2401..b365cbe99cc3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -732,6 +732,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; int n_ret = 0; + bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially @@ -863,6 +864,25 @@ checks: goto checks; } + /* + * Even if there's no free clusters available (fragmented), + * try to scan a little more quickly with lock held unless we + * have scanned too many slots already. 
+ */ + if (!scanned_many) { + unsigned long scan_limit; + + if (offset < scan_base) + scan_limit = scan_base; + else + scan_limit = si->highest_bit; + for (; offset <= scan_limit && --latency_ration > 0; + offset++) { + if (!si->swap_map[offset]) + goto checks; + } + } + done: si->flags -= SWP_SCANNING; return n_ret; @@ -881,6 +901,7 @@ scan: if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; + scanned_many = true; } } offset = si->lowest_bit; @@ -896,6 +917,7 @@ scan: if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; + scanned_many = true; } offset++; } -- cgit v1.2.3 From 4b4bb6bb451c8b0c1cbc38fa20a89a3988aa4e0b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jun 2020 21:49:13 -0700 Subject: mm/swapfile.c: classify SWAP_MAP_XXX to make it more readable swap_info_struct->swap_map[] encodes some flag and count. And to do some condition check, it also introduces some special values. Currently those macros are defined with some magic order, which makes audience hard to understand the exact meaning. This patch split those macros into three categories: flag special value for first swap_map special value for continued swap_map May this help audiences a little. 
[akpm@linux-foundation.org: tweak capitalization in comments] Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/20200501015259.32237-1-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- include/linux/swap.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index e1bbf7a16b27..873bf5206afb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -183,12 +183,17 @@ enum { #define SWAP_CLUSTER_MAX 32UL #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX -#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */ -#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */ +/* Bit flag in swap_map */ #define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */ -#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */ -#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */ -#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */ +#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ + +/* Special value in first swap_map */ +#define SWAP_MAP_MAX 0x3e /* Max count */ +#define SWAP_MAP_BAD 0x3f /* Note page is bad */ +#define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs */ + +/* Special value in each swap_map continuation */ +#define SWAP_CONT_MAX 0x7f /* Max count */ /* * We use this to track usage of a cluster. A cluster is a block of swap disk -- cgit v1.2.3 From 33e16272fe982084e986d4eb6d5b5ca51c0d214e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 1 Jun 2020 21:49:16 -0700 Subject: mm/swapfile.c: __swap_entry_free() always free 1 entry __swap_entry_free() always frees 1 entry. Let's remove the usage. 
Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Link: http://lkml.kernel.org/r/20200501015259.32237-2-richard.weiyang@gmail.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index b365cbe99cc3..2aa272376cae 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1273,13 +1273,14 @@ unlock_out: } static unsigned char __swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) + swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); + unsigned char usage; ci = lock_cluster_or_swap_info(p, offset); - usage = __swap_entry_free_locked(p, offset, usage); + usage = __swap_entry_free_locked(p, offset, 1); unlock_cluster_or_swap_info(p, ci); if (!usage) free_swap_slot(entry); @@ -1314,7 +1315,7 @@ void swap_free(swp_entry_t entry) p = _swap_info_get(entry); if (p) - __swap_entry_free(p, entry, 1); + __swap_entry_free(p, entry); } /* @@ -1737,7 +1738,7 @@ int free_swap_and_cache(swp_entry_t entry) p = _swap_info_get(entry); if (p) { - count = __swap_entry_free(p, entry, 1); + count = __swap_entry_free(p, entry); if (count == SWAP_HAS_CACHE && !swap_page_trans_huge_swapped(p, entry)) __try_to_reclaim_swap(p, swp_offset(entry), -- cgit v1.2.3 From 09fe06ce0bf5abe53b77a9515d7fb7579edec9c0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 1 Jun 2020 21:49:19 -0700 Subject: mm/swapfile.c: use prandom_u32_max() To improve the code readability and take advantage of the common implementation. 
Signed-off-by: "Huang, Ying" Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Minchan Kim Cc: Tim Chen Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200512081013.520201-1-ying.huang@intel.com Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 2aa272376cae..18dfccb91123 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3209,7 +3209,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * select a random position to start with to help wear leveling * SSD */ - p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + p->cluster_next = 1 + prandom_u32_max(p->highest_bit); nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), -- cgit v1.2.3 From 490705888107c3edf8c264ec930909107f76a984 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 1 Jun 2020 21:49:22 -0700 Subject: swap: reduce lock contention on swap cache from swap slots allocation In some swap scalability test, it is found that there are heavy lock contention on swap cache even if we have split one swap cache radix tree per swap device to one swap cache radix tree every 64 MB trunk in commit 4b3ef9daa4fc ("mm/swap: split swap cache into 64MB trunks"). The reason is as follow. After the swap device becomes fragmented so that there's no free swap cluster, the swap device will be scanned linearly to find the free swap slots. swap_info_struct->cluster_next is the next scanning base that is shared by all CPUs. So nearby free swap slots will be allocated for different CPUs. The probability for multiple CPUs to operate on the same 64 MB trunk is high. This causes the lock contention on the swap cache. To solve the issue, in this patch, for SSD swap device, a percpu version next scanning base (cluster_next_cpu) is added. Every CPU will use its own per-cpu next scanning base. 
And after finishing scanning a 64MB trunk, the per-cpu scanning base will be changed to the beginning of another randomly selected 64MB trunk. In this way, the probability for multiple CPUs to operate on the same 64 MB trunk is reduced greatly. Thus the lock contention is reduced too. For HDD, because sequential access is more important for IO performance, the original shared next scanning base is used. To test the patch, we have run 16-process pmbench memory benchmark on a 2-socket server machine with 48 cores. One ram disk is configured as the swap device per socket. The pmbench working-set size is much larger than the available memory so that swapping is triggered. The memory read/write ratio is 80/20 and the accessing pattern is random. In the original implementation, the lock contention on the swap cache is heavy. The perf profiling data of the lock contention code path is as following, _raw_spin_lock_irq.add_to_swap_cache.add_to_swap.shrink_page_list: 7.91 _raw_spin_lock_irqsave.__remove_mapping.shrink_page_list: 7.11 _raw_spin_lock.swapcache_free_entries.free_swap_slot.__swap_entry_free: 2.51 _raw_spin_lock_irqsave.swap_cgroup_record.mem_cgroup_uncharge_swap: 1.66 _raw_spin_lock_irq.shrink_inactive_list.shrink_lruvec.shrink_node: 1.29 _raw_spin_lock.free_pcppages_bulk.drain_pages_zone.drain_pages: 1.03 _raw_spin_lock_irq.shrink_active_list.shrink_lruvec.shrink_node: 0.93 After applying this patch, it becomes, _raw_spin_lock.swapcache_free_entries.free_swap_slot.__swap_entry_free: 3.58 _raw_spin_lock_irq.shrink_inactive_list.shrink_lruvec.shrink_node: 2.3 _raw_spin_lock_irqsave.swap_cgroup_record.mem_cgroup_uncharge_swap: 2.26 _raw_spin_lock_irq.shrink_active_list.shrink_lruvec.shrink_node: 1.8 _raw_spin_lock.free_pcppages_bulk.drain_pages_zone.drain_pages: 1.19 The lock contention on the swap cache is almost eliminated. And the pmbench score increases 18.5%. The swapin throughput increases 18.7% from 2.96 GB/s to 3.51 GB/s. 
While the swapout throughput increases 18.5% from 2.99 GB/s to 3.54 GB/s. We need really fast disk to show the benefit. I have tried this on 2 Intel P3600 NVMe disks. The performance improvement is only about 1%. The improvement should be better on the faster disks, such as Intel Optane disk. [ying.huang@intel.com: fix cluster_next_cpu allocation and freeing, per Daniel] Link: http://lkml.kernel.org/r/20200525002648.336325-1-ying.huang@intel.com [ying.huang@intel.com: v4] Link: http://lkml.kernel.org/r/20200529010840.928819-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Signed-off-by: Andrew Morton Reviewed-by: Daniel Jordan Cc: Michal Hocko Cc: Minchan Kim Cc: Tim Chen Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200520031502.175659-1-ying.huang@intel.com Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/swapfile.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 873bf5206afb..6c23a6a14012 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -252,6 +252,7 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ + unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 18dfccb91123..c531d2687dd0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -613,10 +613,12 @@ new_cluster: } else if (!cluster_list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them + * discarding, do discard now 
and reclaim them, then + * reread cluster_next_cpu since we dropped si->lock */ swap_do_scheduled_discard(si); - *scan_base = *offset = si->cluster_next; + *scan_base = this_cpu_read(*si->cluster_next_cpu); + *offset = *scan_base; goto new_cluster; } else return false; @@ -722,6 +724,34 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, } } +static void set_cluster_next(struct swap_info_struct *si, unsigned long next) +{ + unsigned long prev; + + if (!(si->flags & SWP_SOLIDSTATE)) { + si->cluster_next = next; + return; + } + + prev = this_cpu_read(*si->cluster_next_cpu); + /* + * Cross the swap address space size aligned trunk, choose + * another trunk randomly to avoid lock contention on swap + * address space if possible. + */ + if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != + (next >> SWAP_ADDRESS_SPACE_SHIFT)) { + /* No free swap slots available */ + if (si->highest_bit <= si->lowest_bit) + return; + next = si->lowest_bit + + prandom_u32_max(si->highest_bit - si->lowest_bit + 1); + next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); + next = max_t(unsigned int, next, si->lowest_bit); + } + this_cpu_write(*si->cluster_next_cpu, next); +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -746,7 +776,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si, */ si->flags += SWP_SCANNING; - scan_base = offset = si->cluster_next; + /* + * Use percpu scan base for SSD to reduce lock contention on + * cluster and swap cache. For HDD, sequential access is more + * important. + */ + if (si->flags & SWP_SOLIDSTATE) + scan_base = this_cpu_read(*si->cluster_next_cpu); + else + scan_base = si->cluster_next; + offset = scan_base; /* SSD algorithm */ if (si->cluster_info) { @@ -835,7 +874,6 @@ checks: unlock_cluster(ci); swap_range_alloc(si, offset, 1); - si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); /* got enough slots or reach max slots? 
*/ @@ -884,6 +922,7 @@ checks: } done: + set_cluster_next(si, offset + 1); si->flags -= SWP_SCANNING; return n_ret; @@ -2653,6 +2692,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; vfree(swap_map); kvfree(cluster_info); kvfree(frontswap_map); @@ -3205,11 +3246,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned long ci, nr_cluster; p->flags |= SWP_SOLIDSTATE; + p->cluster_next_cpu = alloc_percpu(unsigned int); + if (!p->cluster_next_cpu) { + error = -ENOMEM; + goto bad_swap_unlock_inode; + } /* * select a random position to start with to help wear leveling * SSD */ - p->cluster_next = 1 + prandom_u32_max(p->highest_bit); + for_each_possible_cpu(cpu) { + per_cpu(*p->cluster_next_cpu, cpu) = + 1 + prandom_u32_max(p->highest_bit); + } nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info), @@ -3325,6 +3374,8 @@ bad_swap_unlock_inode: bad_swap: free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; + free_percpu(p->cluster_next_cpu); + p->cluster_next_cpu = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); -- cgit v1.2.3 From 6f7939405f61de7d0da7f6c90182e96c4f5ff6c1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 1 Jun 2020 21:49:26 -0700 Subject: mm: swapfile: fix /proc/swaps heading and Size/Used/Priority alignment Fix the heading and Size/Used/Priority field alignments in /proc/swaps. If the Size and/or Used value is >= 10000000 (8 bytes), then the alignment by using tab characters is broken. This patch maintains the use of tabs for alignment. If spaces are preferred, we can just use a Field Width specifier for the bytes and inuse fields. 
That way those fields don't have to be a multiple of 8 bytes in width. E.g., with a field width of 12, both Size and Used would always fit on the first line of an 80-column wide terminal (only Priority would be on the second line). There are actually 2 problems: heading alignment and field width. On an xterm, if Used is 7 bytes in length, the tab does nothing, and the display is like this, with no space/tab between the Used and Priority fields. (ugh) Filename Type Size Used Priority /dev/sda8 partition 16779260 2023012-1 To be clear, if one does 'cat /proc/swaps >/tmp/proc.swaps', it does look different, like so: Filename Type Size Used Priority /dev/sda8 partition 16779260 2086988 -1 Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Hugh Dickins Cc: Alexander Viro Link: http://lkml.kernel.org/r/c0ffb41a-81ac-ddfa-d452-a9229ecc0387@infradead.org Signed-off-by: Linus Torvalds --- mm/swapfile.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index c531d2687dd0..63ac67208453 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2801,20 +2801,24 @@ static int swap_show(struct seq_file *swap, void *v) struct swap_info_struct *si = v; struct file *file; int len; + unsigned int bytes, inuse; if (si == SEQ_START_TOKEN) { - seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); + seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); return 0; } + bytes = si->pages << (PAGE_SHIFT - 10); + inuse = si->inuse_pages << (PAGE_SHIFT - 10); + file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); - seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", + seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", - si->pages << (PAGE_SHIFT - 10), - si->inuse_pages << (PAGE_SHIFT - 10), + bytes, bytes < 10000000 ? "\t" : "", + inuse, inuse < 10000000 ? 
"\t" : "", si->prio); return 0; } -- cgit v1.2.3 From 251af0cda614e5ad6bfda059bd120aed7432891d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 1 Jun 2020 21:49:29 -0700 Subject: include/linux/swap.h: delete meaningless __add_to_swap_cache() declaration Since commit 8d93b41c09d1 ("mm: Convert add_to_swap_cache to XArray"), __add_to_swap_cache and add_to_swap_cache are combined into one function. There is no __add_to_swap_cache() anymore. Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: "Huang, Ying" Link: http://lkml.kernel.org/r/1590810326-2493-1-git-send-email-linmiaohe@huawei.com Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 6c23a6a14012..68ef7638311f 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -414,7 +414,6 @@ extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); extern int add_to_swap(struct page *page); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); -extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); extern void __delete_from_swap_cache(struct page *, swp_entry_t entry); extern void delete_from_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); -- cgit v1.2.3 From a6f5576bb195c3b7508e3e1c98d2dcf6691f96e8 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 1 Jun 2020 21:49:32 -0700 Subject: mm, memcg: add workingset_restore in memory.stat There's a new workingset counter introduced in commit 1899ad18c607 ("mm: workingset: tell cache transitions from workingset thrashing"). With the help of this counter we can know the workingset is transitioning or thrashing. To leverage the benifit of this counter to memcg, we should introduce it into memory.stat. Then we could know the workingset of the workload inside a memcg better. Bellow is the verification of this new counter in memory.stat. 
Read a file into the memory and then read it again to make these pages active. The size of this file is 1G. (memory.max is greater than file size) The counters in memory.stat will be inactive_file 0 active_file 1073639424 workingset_refault 0 workingset_activate 0 workingset_restore 0 workingset_nodereclaim 0 Trigger the memcg reclaim by setting a lower value to memory.high, and then some pages will be demoted into the inactive list, and then some pages in the inactive list will be evicted into the storage. inactive_file 498094080 active_file 310063104 workingset_refault 0 workingset_activate 0 workingset_restore 0 workingset_nodereclaim 0 Then recover the memory.high and read the file into memory again. As a result of it, the transitioning will occur. Below is the result of this transitioning: inactive_file 498094080 active_file 575397888 workingset_refault 64746 workingset_activate 64746 workingset_restore 64746 workingset_nodereclaim 0 Signed-off-by: Yafang Shao Signed-off-by: Andrew Morton Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Chris Down Cc: Peter Zijlstra (Intel) Cc: Suren Baghdasaryan Cc: Shakeel Butt Link: http://lkml.kernel.org/r/20200504153522.11553-1-laoar.shao@gmail.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 4 ++++ mm/memcontrol.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index bcc80269bb6a..5f12f203822e 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1329,6 +1329,10 @@ PAGE_SIZE multiple when read back. workingset_activate Number of refaulted pages that were immediately activated + workingset_restore + Number of restored pages which have been detected as an active + workingset before they got reclaimed.
+ workingset_nodereclaim Number of times a shadow node has been reclaimed diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1db4b285c407..4bb922c02521 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1451,6 +1451,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg) memcg_page_state(memcg, WORKINGSET_REFAULT)); seq_buf_printf(&s, "workingset_activate %lu\n", memcg_page_state(memcg, WORKINGSET_ACTIVATE)); + seq_buf_printf(&s, "workingset_restore %lu\n", + memcg_page_state(memcg, WORKINGSET_RESTORE)); seq_buf_printf(&s, "workingset_nodereclaim %lu\n", memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); -- cgit v1.2.3 From 1c4448edb71ab9da35a938244602cdd75809de2b Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Mon, 1 Jun 2020 21:49:36 -0700 Subject: mm: memcontrol: simplify value comparison between count and limit When the variables count and limit have the same value(count == limit), the result of min(margin, limit - count) statement should be 0 and the variable margin is set to 0. So in this case, the min() statement is not necessary and we can directly set the variable margin to 0. 
Signed-off-by: Kaixu Xia Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Link: http://lkml.kernel.org/r/1587479661-27237-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4bb922c02521..2243230befd2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1314,7 +1314,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); limit = READ_ONCE(memcg->memsw.max); - if (count <= limit) + if (count < limit) margin = min(margin, limit - count); else margin = 0; -- cgit v1.2.3 From 54b512e96d44664d0cdb75b4f6db6b821535dbf8 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 1 Jun 2020 21:49:39 -0700 Subject: memcg: expose root cgroup's memory.stat One way to measure the efficiency of memory reclaim is to look at the ratio (pgscan+pgrefill)/pgsteal. However at the moment these stats are not updated consistently at the system level and the ratio of these is not very meaningful. The pgsteal and pgscan are updated for only global reclaim while pgrefill gets updated for global as well as cgroup reclaim. Please note that this difference is only for system level vmstats. The cgroup stats returned by memory.stat are actually consistent. The cgroup's pgsteal contains number of reclaimed pages for global as well as cgroup reclaim. So, one way to get the system level stats is to get these stats from root's memory.stat, so, expose memory.stat for the root cgroup. From Johannes Weiner: There are subtle differences between /proc/vmstat and memory.stat, and cgroup-aware code that wants to watch the full hierarchy currently has to know about these intricacies and translate semantics back and forth. Generally having the fully recursive memory.stat at the root level could help a broader range of usecases.
Why not fix the stats by including both the global and cgroup reclaim activity instead of exposing root cgroup's memory.stat? The reason is the benefit of having metrics exposing the activity that happens purely due to machine capacity rather than localized activity that happens due to the limits throughout the cgroup tree. Additionally there are userspace tools like sysstat(sar) which reads these stats to inform about the system level reclaim activity. So, we should not break such use-cases. Suggested-by: Johannes Weiner Signed-off-by: Shakeel Butt Signed-off-by: Andrew Morton Acked-by: Johannes Weiner Acked-by: Yafang Shao Acked-by: Chris Down Cc: Mel Gorman Cc: Roman Gushchin Cc: Michal Hocko Link: http://lkml.kernel.org/r/20200508170630.94406-1-shakeelb@google.com Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2243230befd2..d5bbd920398e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6228,7 +6228,6 @@ static struct cftype memory_files[] = { }, { .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_stat_show, }, { -- cgit v1.2.3 From 8a5dbc657e1233c907f84f2c6192d3a3cf0026b3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 1 Jun 2020 21:49:42 -0700 Subject: mm/memcg: prepare for swap over-high accounting and penalty calculation Patch series "memcg: Slow down swap allocation as the available space gets depleted", v6. Tejun describes the problem as follows: When swap runs out, there's an abrupt change in system behavior - the anonymous memory suddenly becomes unmanageable which readily breaks any sort of memory isolation and can bring down the whole system. To avoid that, oomd [1] monitors free swap space and triggers kills when it drops below the specific threshold (e.g. 15%). While this works, it's far from ideal: - Depending on IO performance and total swap size, a given headroom might not be enough or too much. 
- oomd has to monitor swap depletion in addition to the usual pressure metrics and it currently doesn't consider memory.swap.max. Solve this by adapting parts of the approach that memory.high uses - slow down allocation as the resource gets depleted turning the depletion behavior from abrupt cliff one to gradual degradation observable through memory pressure metric. [1] https://github.com/facebookincubator/oomd This patch (of 4): Slice the memory overage calculation logic a little bit so we can reuse it to apply a similar penalty to the swap. The logic which accesses the memory-specific fields (use and high values) has to be taken out of calculate_high_delay(). Signed-off-by: Jakub Kicinski Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Chris Down Cc: Michal Hocko Cc: Tejun Heo Link: http://lkml.kernel.org/r/20200527195846.102707-1-kuba@kernel.org Link: http://lkml.kernel.org/r/20200527195846.102707-2-kuba@kernel.org Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 62 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5bbd920398e..b0ac90dc3bb0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2321,41 +2321,48 @@ static void high_work_func(struct work_struct *work) #define MEMCG_DELAY_PRECISION_SHIFT 20 #define MEMCG_DELAY_SCALING_SHIFT 14 -/* - * Get the number of jiffies that we should penalise a mischievous cgroup which - * is exceeding its memory.high by checking both it and its ancestors. 
- */ -static unsigned long calculate_high_delay(struct mem_cgroup *memcg, - unsigned int nr_pages) +static u64 calculate_overage(unsigned long usage, unsigned long high) { - unsigned long penalty_jiffies; - u64 max_overage = 0; - - do { - unsigned long usage, high; - u64 overage; + u64 overage; - usage = page_counter_read(&memcg->memory); - high = READ_ONCE(memcg->high); + if (usage <= high) + return 0; - if (usage <= high) - continue; + /* + * Prevent division by 0 in overage calculation by acting as if + * it was a threshold of 1 page + */ + high = max(high, 1UL); - /* - * Prevent division by 0 in overage calculation by acting as if - * it was a threshold of 1 page - */ - high = max(high, 1UL); + overage = usage - high; + overage <<= MEMCG_DELAY_PRECISION_SHIFT; + return div64_u64(overage, high); +} - overage = usage - high; - overage <<= MEMCG_DELAY_PRECISION_SHIFT; - overage = div64_u64(overage, high); +static u64 mem_find_max_overage(struct mem_cgroup *memcg) +{ + u64 overage, max_overage = 0; - if (overage > max_overage) - max_overage = overage; + do { + overage = calculate_overage(page_counter_read(&memcg->memory), + READ_ONCE(memcg->high)); + max_overage = max(overage, max_overage); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); + return max_overage; +} + +/* + * Get the number of jiffies that we should penalise a mischievous cgroup which + * is exceeding its memory.high by checking both it and its ancestors. + */ +static unsigned long calculate_high_delay(struct mem_cgroup *memcg, + unsigned int nr_pages, + u64 max_overage) +{ + unsigned long penalty_jiffies; + if (!max_overage) return 0; @@ -2411,7 +2418,8 @@ void mem_cgroup_handle_over_high(void) * memory.high is breached and reclaim is unable to keep up. Throttle * allocators proactively to slow down excessive growth. 
*/ - penalty_jiffies = calculate_high_delay(memcg, nr_pages); + penalty_jiffies = calculate_high_delay(memcg, nr_pages, + mem_find_max_overage(memcg)); /* * Don't sleep if the amount of jiffies this memcg owes us is so low -- cgit v1.2.3 From ff144e69f7331eff070f31edf22bc9123d6d6d27 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 1 Jun 2020 21:49:45 -0700 Subject: mm/memcg: move penalty delay clamping out of calculate_high_delay() We will want to call calculate_high_delay() twice - once for memory and once for swap, and we should apply the clamp value to sum of the penalties. Clamping has to be applied outside of calculate_high_delay(). Signed-off-by: Jakub Kicinski Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Chris Down Cc: Hugh Dickins Cc: Michal Hocko Cc: Tejun Heo Link: http://lkml.kernel.org/r/20200527195846.102707-3-kuba@kernel.org Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b0ac90dc3bb0..6a857b87428d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2386,14 +2386,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or * larger the current charge patch is than that. */ - penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; - - /* - * Clamp the max delay per usermode return so as to still keep the - * application moving forwards and also permit diagnostics, albeit - * extremely slowly. 
- */ - return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); + return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; } /* @@ -2421,6 +2414,13 @@ void mem_cgroup_handle_over_high(void) penalty_jiffies = calculate_high_delay(memcg, nr_pages, mem_find_max_overage(memcg)); + /* + * Clamp the max delay per usermode return so as to still keep the + * application moving forwards and also permit diagnostics, albeit + * extremely slowly. + */ + penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); + /* * Don't sleep if the amount of jiffies this memcg owes us is so low * that it's not even worth doing, in an attempt to be nice to those who -- cgit v1.2.3 From d1663a907bd348f912b7f7088e83ca1b6fd3309f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 1 Jun 2020 21:49:49 -0700 Subject: mm/memcg: move cgroup high memory limit setting into struct page_counter High memory limit is currently recorded directly in struct mem_cgroup. We are about to add a high limit for swap, move the field to struct page_counter and add some helpers. 
Signed-off-by: Jakub Kicinski Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Chris Down Cc: Hugh Dickins Cc: Michal Hocko Cc: Tejun Heo Link: http://lkml.kernel.org/r/20200527195846.102707-4-kuba@kernel.org Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 --- include/linux/page_counter.h | 8 ++++++++ mm/memcontrol.c | 19 +++++++++++-------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 977edd3b7bd8..95a09a7ec412 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -215,9 +215,6 @@ struct mem_cgroup { struct page_counter kmem; struct page_counter tcpmem; - /* Upper bound of normal memory consumption range */ - unsigned long high; - /* Range enforcement for interrupt charges */ struct work_struct high_work; diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index bab7e57f659b..85bd413e784e 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -10,6 +10,7 @@ struct page_counter { atomic_long_t usage; unsigned long min; unsigned long low; + unsigned long high; unsigned long max; struct page_counter *parent; @@ -55,6 +56,13 @@ bool page_counter_try_charge(struct page_counter *counter, void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); + +static inline void page_counter_set_high(struct page_counter *counter, + unsigned long nr_pages) +{ + WRITE_ONCE(counter->high, nr_pages); +} + int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a857b87428d..08cf17b186fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ 
-2252,7 +2252,8 @@ static void reclaim_high(struct mem_cgroup *memcg, gfp_t gfp_mask) { do { - if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high)) + if (page_counter_read(&memcg->memory) <= + READ_ONCE(memcg->memory.high)) continue; memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); @@ -2345,7 +2346,7 @@ static u64 mem_find_max_overage(struct mem_cgroup *memcg) do { overage = calculate_overage(page_counter_read(&memcg->memory), - READ_ONCE(memcg->high)); + READ_ONCE(memcg->memory.high)); max_overage = max(overage, max_overage); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2604,7 +2605,8 @@ done_restock: * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) { + if (page_counter_read(&memcg->memory) > + READ_ONCE(memcg->memory.high)) { /* Don't bother a random interrupted task */ if (in_interrupt()) { schedule_work(&memcg->high_work); @@ -4347,7 +4349,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, while ((parent = parent_mem_cgroup(memcg))) { unsigned long ceiling = min(READ_ONCE(memcg->memory.max), - READ_ONCE(memcg->high)); + READ_ONCE(memcg->memory.high)); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -5072,7 +5074,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(memcg)) return ERR_CAST(memcg); - WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); + page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); @@ -5225,7 +5227,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); - WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX); + 
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); } @@ -6024,7 +6026,8 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, static int memory_high_show(struct seq_file *m, void *v) { - return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->memory.high)); } static ssize_t memory_high_write(struct kernfs_open_file *of, @@ -6041,7 +6044,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (err) return err; - WRITE_ONCE(memcg->high, high); + page_counter_set_high(&memcg->memory, high); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); -- cgit v1.2.3 From 4b82ab4f28836646eca12cb37f408568d3cdc5c3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 1 Jun 2020 21:49:52 -0700 Subject: mm/memcg: automatically penalize tasks with high swap use Add a memory.swap.high knob, which can be used to protect the system from SWAP exhaustion. The mechanism used for penalizing is similar to memory.high penalty (sleep on return to user space). That is not to say that the knob itself is equivalent to memory.high. The objective is more to protect the system from potentially buggy tasks consuming a lot of swap and impacting other tasks, or even bringing the whole system to stand still with complete SWAP exhaustion. Hopefully without the need to find per-task hard limits. Slowing misbehaving tasks down gradually allows user space oom killers or other protection mechanisms to react. oomd and earlyoom already do killing based on swap exhaustion, and memory.swap.high protection will help implement such userspace oom policies more reliably. We can use one counter for number of pages allocated under pressure to save struct task space and avoid two separate hierarchy walks on the hot path. The exact overage is calculated on return to user space, anyway. 
Take the new high limit into account when determining if swap is "full". Borrowing the explanation from Johannes: The idea behind "swap full" is that as long as the workload has plenty of swap space available and it's not changing its memory contents, it makes sense to generously hold on to copies of data in the swap device, even after the swapin. A later reclaim cycle can drop the page without any IO. Trading disk space for IO. But the only two ways to reclaim a swap slot is when they're faulted in and the references go away, or by scanning the virtual address space like swapoff does - which is very expensive (one could argue it's too expensive even for swapoff, it's often more practical to just reboot). So at some point in the fill level, we have to start freeing up swap slots on fault/swapin. Otherwise we could eventually run out of swap slots while they're filled with copies of data that is also in RAM. We don't want to OOM a workload because its available swap space is filled with redundant cache. Signed-off-by: Jakub Kicinski Signed-off-by: Andrew Morton Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Chris Down Cc: Shakeel Butt Cc: Michal Hocko Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200527195846.102707-5-kuba@kernel.org Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 20 ++++++++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 88 ++++++++++++++++++++++++++++++--- 3 files changed, 102 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 5f12f203822e..b8c0460730f3 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1374,6 +1374,22 @@ PAGE_SIZE multiple when read back. The total amount of swap currently being used by the cgroup and its descendants. + memory.swap.high + A read-write single value file which exists on non-root + cgroups. The default is "max". + + Swap usage throttle limit. 
If a cgroup's swap usage exceeds + this limit, all its further allocations will be throttled to + allow userspace to implement custom out-of-memory procedures. + + This limit marks a point of no return for the cgroup. It is NOT + designed to manage the amount of swapping a workload does + during regular operation. Compare to memory.swap.max, which + prohibits swapping past a set amount, but lets the cgroup + continue unimpeded as long as other memory can be reclaimed. + + Healthy workloads are not expected to reach this limit. + memory.swap.max A read-write single value file which exists on non-root cgroups. The default is "max". @@ -1387,6 +1403,10 @@ PAGE_SIZE multiple when read back. otherwise, a value change in this file generates a file modified event. + high + The number of times the cgroup's swap usage was over + the high threshold. + max The number of times the cgroup's swap usage was about to go over the max boundary and swap allocation diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 95a09a7ec412..bfe9533bb67e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -45,6 +45,7 @@ enum memcg_memory_event { MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, + MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 08cf17b186fb..f3087e22dfa9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2354,6 +2354,22 @@ static u64 mem_find_max_overage(struct mem_cgroup *memcg) return max_overage; } +static u64 swap_find_max_overage(struct mem_cgroup *memcg) +{ + u64 overage, max_overage = 0; + + do { + overage = calculate_overage(page_counter_read(&memcg->swap), + READ_ONCE(memcg->swap.high)); + if (overage) + memcg_memory_event(memcg, MEMCG_SWAP_HIGH); + max_overage = max(overage, max_overage); + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); + + return max_overage; +} + /* * Get the number of jiffies that we should penalise a 
mischievous cgroup which * is exceeding its memory.high by checking both it and its ancestors. @@ -2415,6 +2431,9 @@ void mem_cgroup_handle_over_high(void) penalty_jiffies = calculate_high_delay(memcg, nr_pages, mem_find_max_overage(memcg)); + penalty_jiffies += calculate_high_delay(memcg, nr_pages, + swap_find_max_overage(memcg)); + /* * Clamp the max delay per usermode return so as to still keep the * application moving forwards and also permit diagnostics, albeit @@ -2605,13 +2624,32 @@ done_restock: * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) > - READ_ONCE(memcg->memory.high)) { - /* Don't bother a random interrupted task */ - if (in_interrupt()) { + bool mem_high, swap_high; + + mem_high = page_counter_read(&memcg->memory) > + READ_ONCE(memcg->memory.high); + swap_high = page_counter_read(&memcg->swap) > + READ_ONCE(memcg->swap.high); + + /* Don't bother a random interrupted task */ + if (in_interrupt()) { + if (mem_high) { schedule_work(&memcg->high_work); break; } + continue; + } + + if (mem_high || swap_high) { + /* + * The allocating tasks in this cgroup will need to do + * reclaim or be throttled to prevent further growth + * of the memory or swap footprints. + * + * Target some best-effort fairness between the tasks, + * and distribute reclaim work and delay penalties + * based on how much each task is actually allocating. 
+ */ current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; @@ -5076,6 +5114,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; @@ -5229,6 +5268,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_low(&memcg->memory, 0); page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); memcg_wb_domain_size_changed(memcg); } @@ -7142,10 +7182,13 @@ bool mem_cgroup_swap_full(struct page *page) if (!memcg) return false; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= - READ_ONCE(memcg->swap.max)) + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + unsigned long usage = page_counter_read(&memcg->swap); + + if (usage * 2 >= READ_ONCE(memcg->swap.high) || + usage * 2 >= READ_ONCE(memcg->swap.max)) return true; + } return false; } @@ -7175,6 +7218,29 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; } +static int swap_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); +} + +static ssize_t swap_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + page_counter_set_high(&memcg->swap, high); + + return nbytes; +} + static int swap_max_show(struct seq_file *m, void *v) { return 
seq_puts_memcg_tunable(m, @@ -7202,6 +7268,8 @@ static int swap_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); seq_printf(m, "max %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); seq_printf(m, "fail %lu\n", @@ -7216,6 +7284,12 @@ static struct cftype swap_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = swap_current_read, }, + { + .name = "swap.high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_high_show, + .write = swap_high_write, + }, { .name = "swap.max", .flags = CFTYPE_NOT_ON_ROOT, -- cgit v1.2.3 From 50d53d7c724330a0dc4df26c45de2a9a886c5d88 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Mon, 1 Jun 2020 21:49:55 -0700 Subject: memcg: fix memcg_kmem_bypass() for remote memcg charging While trying to use remote memcg charging in an out-of-tree kernel module I found it's not working, because the current thread is a workqueue thread. As we will probably encounter this issue in the future as the users of memalloc_use_memcg() grow, and it's nothing wrong for this usage, it's better we fix it now. Signed-off-by: Zefan Li Signed-off-by: Andrew Morton Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Link: http://lkml.kernel.org/r/1d202a12-26fe-0012-ea14-f025ddcd044a@huawei.com Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3087e22dfa9..f973a025569b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2852,7 +2852,12 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, static inline bool memcg_kmem_bypass(void) { - if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) + if (in_interrupt()) + return true; + + /* Allow remote memcg charging in kthread contexts. 
*/ + if ((!current->mm || (current->flags & PF_KTHREAD)) && + !current->active_memcg) return true; return false; } -- cgit v1.2.3 From 1494e0c38ee903e83aefb58caf54a9217273d49a Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 1 Jun 2020 21:49:58 -0700 Subject: x86: mm: ptdump: calculate effective permissions correctly Patch series "Fix W+X debug feature on x86" Jan alerted me[1] that the W+X detection debug feature was broken in x86 by my change[2] to switch x86 to use the generic ptdump infrastructure. Fundamentally the approach of trying to move the calculation of effective permissions into note_page() was broken because note_page() is only called for 'leaf' entries and the effective permissions are passed down via the internal nodes of the page tree. The solution I've taken here is to create a new (optional) callback which is called for all nodes of the page tree and therefore can calculate the effective permissions. Secondly on some configurations (32 bit with PAE) "unsigned long" is not large enough to store the table entries. The fix here is simple - let's just use a u64. [1] https://lore.kernel.org/lkml/d573dc7e-e742-84de-473d-f971142fa319@suse.com/ [2] 2ae27137b2db ("x86: mm: convert dump_pagetables to use walk_page_range") This patch (of 2): By switching the x86 page table dump code to use the generic code the effective permissions are no longer calculated correctly because the note_page() function is only called for *leaf* entries. To calculate the actual effective permissions it is necessary to observe the full hierarchy of the page tree. Introduce a new callback for ptdump which is called for every entry and can therefore update the prot_levels array correctly. note_page() can then simply access the appropriate element in the array. 
[steven.price@arm.com: make the assignment conditional on val != 0] Link: http://lkml.kernel.org/r/430c8ab4-e7cd-6933-dde6-087fac6db872@arm.com Fixes: 2ae27137b2db ("x86: mm: convert dump_pagetables to use walk_page_range") Reported-by: Jan Beulich Signed-off-by: Steven Price Signed-off-by: Andrew Morton Cc: Qian Cai Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/20200521152308.33096-1-steven.price@arm.com Link: http://lkml.kernel.org/r/20200521152308.33096-2-steven.price@arm.com Signed-off-by: Linus Torvalds --- arch/x86/mm/dump_pagetables.c | 33 ++++++++++++++++++++------------- include/linux/ptdump.h | 1 + mm/ptdump.c | 17 ++++++++++++++++- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 69309cd56fdf..33093fdedb02 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -249,10 +249,22 @@ static void note_wx(struct pg_state *st, unsigned long addr) (void *)st->start_address); } -static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2) +static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) { - return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) | - ((prot1 | prot2) & _PAGE_NX); + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); + pgprotval_t prot = val & PTE_FLAGS_MASK; + pgprotval_t effective; + + if (level > 0) { + pgprotval_t higher_prot = st->prot_levels[level - 1]; + + effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) | + ((higher_prot | prot) & _PAGE_NX); + } else { + effective = prot; + } + + st->prot_levels[level] = effective; } /* @@ -270,16 +282,10 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, struct seq_file *m = st->seq; new_prot = val & PTE_FLAGS_MASK; - - if (level > 0) { - new_eff = effective_prot(st->prot_levels[level - 1], - new_prot); - 
} else { - new_eff = new_prot; - } - - if (level >= 0) - st->prot_levels[level] = new_eff; + if (!val) + new_eff = 0; + else + new_eff = st->prot_levels[level]; /* * If we have a "break" in the series, we need to flush the state that @@ -374,6 +380,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, struct pg_state st = { .ptdump = { .note_page = note_page, + .effective_prot = effective_prot, .range = ptdump_ranges }, .level = -1, diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index a67065c403c3..ac01502763bf 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -14,6 +14,7 @@ struct ptdump_state { /* level is 0:PGD to 4:PTE, or -1 if unknown */ void (*note_page)(struct ptdump_state *st, unsigned long addr, int level, unsigned long val); + void (*effective_prot)(struct ptdump_state *st, int level, u64 val); const struct ptdump_range *range; }; diff --git a/mm/ptdump.c b/mm/ptdump.c index 26208d0d03b7..f4ce916f5602 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -36,6 +36,9 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 0, pgd_val(val)); + if (pgd_leaf(val)) st->note_page(st, addr, 0, pgd_val(val)); @@ -53,6 +56,9 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 1, p4d_val(val)); + if (p4d_leaf(val)) st->note_page(st, addr, 1, p4d_val(val)); @@ -70,6 +76,9 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 2, pud_val(val)); + if (pud_leaf(val)) st->note_page(st, addr, 2, pud_val(val)); @@ -87,6 +96,8 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, return note_kasan_page_table(walk, addr); #endif + if (st->effective_prot) + st->effective_prot(st, 3, pmd_val(val)); if 
(pmd_leaf(val)) st->note_page(st, addr, 3, pmd_val(val)); @@ -97,8 +108,12 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct ptdump_state *st = walk->private; + pte_t val = READ_ONCE(*pte); + + if (st->effective_prot) + st->effective_prot(st, 4, pte_val(val)); - st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte))); + st->note_page(st, addr, 4, pte_val(val)); return 0; } -- cgit v1.2.3 From 99395ee3f7b4accc3a16a6aa4c2abb3774fc33ca Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 1 Jun 2020 21:50:01 -0700 Subject: mm: ptdump: expand type of 'val' in note_page() The page table entry is passed in the 'val' argument to note_page(), however this was previously an "unsigned long" which is fine on 64-bit platforms. But for 32 bit x86 it is not always big enough to contain a page table entry which may be 64 bits. Change the type to u64 to ensure that it is always big enough. [akpm@linux-foundation.org: fix riscv] Reported-by: Jan Beulich Signed-off-by: Steven Price Signed-off-by: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/20200521152308.33096-3-steven.price@arm.com Signed-off-by: Linus Torvalds --- arch/arm64/mm/dump.c | 2 +- arch/riscv/mm/ptdump.c | 2 +- arch/x86/mm/dump_pagetables.c | 2 +- include/linux/ptdump.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 860c00ec8bd3..d4313bc0c4c1 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -247,7 +247,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - unsigned long val) + u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); static const char units[] = "KMGTPE"; diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c index 
7eab76a93106..070505d79b06 100644 --- a/arch/riscv/mm/ptdump.c +++ b/arch/riscv/mm/ptdump.c @@ -204,7 +204,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) } static void note_page(struct ptdump_state *pt_st, unsigned long addr, - int level, unsigned long val) + int level, u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); u64 pa = PFN_PHYS(pte_pfn(__pte(val))); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 33093fdedb02..ea9010113f69 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -273,7 +273,7 @@ static void effective_prot(struct ptdump_state *pt_st, int level, u64 val) * print what we collected so far. */ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, - unsigned long val) + u64 val) { struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); pgprotval_t new_prot, new_eff; diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index ac01502763bf..2a3a95586425 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -13,7 +13,7 @@ struct ptdump_range { struct ptdump_state { /* level is 0:PGD to 4:PTE, or -1 if unknown */ void (*note_page)(struct ptdump_state *st, unsigned long addr, - int level, unsigned long val); + int level, u64 val); void (*effective_prot)(struct ptdump_state *st, int level, u64 val); const struct ptdump_range *range; }; -- cgit v1.2.3 From c94b6923fa0a954fbe14ea0168b9d324011fd817 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 1 Jun 2020 21:50:05 -0700 Subject: /proc/PID/smaps: Add PMD migration entry parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now, when reading /proc/PID/smaps, the PMD migration entry in page table is simply ignored. To improve the accuracy of /proc/PID/smaps, its parsing and processing is added. 
To test the patch, we run pmbench to eat 400 MB memory in background, then run /usr/bin/migratepages and `cat /proc/PID/smaps` every second. The issue as follows can be reproduced within 60 seconds. Before the patch, for the fully populated 400 MB anonymous VMA, some THP pages under migration may be lost as below. 7f3f6a7e5000-7f3f837e5000 rw-p 00000000 00:00 0 Size: 409600 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 407552 kB Pss: 407552 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 407552 kB Referenced: 301056 kB Anonymous: 407552 kB LazyFree: 0 kB AnonHugePages: 405504 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 1 VmFlags: rd wr mr mw me ac After the patch, it will be always, 7f3f6a7e5000-7f3f837e5000 rw-p 00000000 00:00 0 Size: 409600 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 409600 kB Pss: 409600 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 409600 kB Referenced: 294912 kB Anonymous: 409600 kB LazyFree: 0 kB AnonHugePages: 407552 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 1 VmFlags: rd wr mr mw me ac Signed-off-by: "Huang, Ying" Signed-off-by: Andrew Morton Reviewed-by: Zi Yan Acked-by: Michal Hocko Acked-by: Kirill A. 
Shutemov Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Alexey Dobriyan Cc: Konstantin Khlebnikov Cc: "Jérôme Glisse" Cc: Yang Shi Link: http://lkml.kernel.org/r/20200403123059.1846960-1-ying.huang@intel.com Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8d382d4ec067..36dc7417c0df 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -546,10 +546,17 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); - struct page *page; + struct page *page = NULL; + + if (pmd_present(*pmd)) { + /* FOLL_DUMP will return -EFAULT on huge zero page */ + page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); - /* FOLL_DUMP will return -EFAULT on huge zero page */ - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + if (is_migration_entry(entry)) + page = migration_entry_to_page(entry); + } if (IS_ERR_OR_NULL(page)) return; if (PageAnon(page)) @@ -578,8 +585,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - if (pmd_present(*pmd)) - smaps_pmd_entry(pmd, addr, walk); + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); goto out; } -- cgit v1.2.3 From 6972f55c413fedb7d5701a65e751ea6f10d1e0a5 Mon Sep 17 00:00:00 2001 From: chenqiwu Date: Mon, 1 Jun 2020 21:50:08 -0700 Subject: mm/memory: remove unnecessary pte_devmap case in copy_one_pte() Since commit 25b2995a35b6 ("mm: remove MEMORY_DEVICE_PUBLIC support"), the assignment to 'page' for pte_devmap case has been unnecessary. Let's remove it. 
[willy@infradead.org: changelog] Signed-off-by: chenqiwu Signed-off-by: Andrew Morton Reviewed-by: Matthew Wilcox Acked-by: Michal Hocko Link: http://lkml.kernel.org/r/1587349685-31712-1-git-send-email-qiwuchen55@gmail.com Signed-off-by: Linus Torvalds --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index f703fe8c8346..21438278afca 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -802,8 +802,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; - } else if (pte_devmap(pte)) { - page = pte_page(pte); } out_set_pte: -- cgit v1.2.3 From 872e9a205c8491daf1a51ea3733c8c1d15d51e10 Mon Sep 17 00:00:00 2001 From: Wetp Zhang Date: Mon, 1 Jun 2020 21:50:11 -0700 Subject: mm, memory_failure: don't send BUS_MCEERR_AO for action required error Some processes don't want to be killed early, but in "Action Required" case, those also may be killed by BUS_MCEERR_AO when sharing memory with another process which is accessing the failed memory. And sending SIGBUS with BUS_MCEERR_AO for action required error is strange, so ignore the non-current processes here.
Suggested-by: Naoya Horiguchi Signed-off-by: Wetp Zhang Signed-off-by: Andrew Morton Acked-by: Naoya Horiguchi Acked-by: Pankaj Gupta Link: http://lkml.kernel.org/r/1590817116-21281-1-git-send-email-wetp.zy@linux.alibaba.com Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a96364be8ab4..dd3862fcf2e9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -210,14 +210,17 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) { struct task_struct *t = tk->tsk; short addr_lsb = tk->size_shift; - int ret; + int ret = 0; - pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", - pfn, t->comm, t->pid); + if ((t->mm == current->mm) || !(flags & MF_ACTION_REQUIRED)) + pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", + pfn, t->comm, t->pid); - if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { - ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr, - addr_lsb); + if (flags & MF_ACTION_REQUIRED) { + if (t->mm == current->mm) + ret = force_sig_mceerr(BUS_MCEERR_AR, + (void __user *)tk->addr, addr_lsb); + /* send no signal to non-current processes */ } else { /* * Don't use force here, it's convenient if the signal -- cgit v1.2.3 From 78bb17f76edc3959152334947deec4dcb56e3764 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:15 -0700 Subject: x86/hyperv: use vmalloc_exec for the hypercall page Patch series "decruft the vmalloc API", v2. Peter noticed that with some dumb luck you can toast the kernel address space with exported vmalloc symbols. I used this as an opportunity to decruft the vmalloc.c API and make it much more systematic. This also removes any chance to create vmalloc mappings outside the designated areas or using executable permissions from modules. 
Besides that it removes more than 300 lines of code. This patch (of 29): Use the designated helper for allocating executable kernel memory, and remove the now unused PAGE_KERNEL_RX define. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Michael Kelley Acked-by: Wei Liu Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Catalin Marinas Cc: Will Deacon Cc: Heiko Carstens Cc: Vasily Gorbik Link: http://lkml.kernel.org/r/20200414131348.444715-1-hch@lst.de Link: http://lkml.kernel.org/r/20200414131348.444715-2-hch@lst.de Signed-off-by: Linus Torvalds --- arch/x86/hyperv/hv_init.c | 2 +- arch/x86/include/asm/pgtable_types.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index acf76b466db6..697ddd2afef9 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -379,7 +379,7 @@ void __init hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + hv_hypercall_pg = vmalloc_exec(PAGE_SIZE); if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); goto remove_cpuhp_state; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b6606fe6cfdf..947867f112ea 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -194,7 +194,6 @@ enum page_cache_mode { #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) #define __PAGE_KERNEL_RO (__PP| 
0| 0|___A|__NX|___D| 0|___G) -#define __PAGE_KERNEL_RX (__PP| 0| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) #define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) @@ -220,7 +219,6 @@ enum page_cache_mode { #define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC) #define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC) #define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0) -#define PAGE_KERNEL_RX __pgprot_mask(__PAGE_KERNEL_RX | _ENC) #define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC) #define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC) #define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC) -- cgit v1.2.3 From 0348801151b5aefbcf9d6e9b9e30aceb3a2a7b13 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:19 -0700 Subject: x86: fix vmap arguments in map_irq_stack vmap does not take a gfp_t, the flags argument is for VM_* flags. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Peter Zijlstra (Intel) Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-3-hch@lst.de Signed-off-by: Linus Torvalds --- arch/x86/kernel/irq_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 12df3a4abfdd..6b32ab009c19 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -43,7 +43,7 @@ static int map_irq_stack(unsigned int cpu) pages[i] = pfn_to_page(pa >> PAGE_SHIFT); } - va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL); if (!va) return -ENOMEM; -- cgit v1.2.3 From 5bf9917452112694b2c774465ee4dbe441c84b77 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:23 -0700 Subject: staging: android: ion: use vmap instead of vm_map_ram vm_map_ram can keep mappings around after the vm_unmap_ram. Using that with non-PAGE_KERNEL mappings can lead to all kinds of aliasing issues. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Greg Kroah-Hartman Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-4-hch@lst.de Signed-off-by: Linus Torvalds --- drivers/staging/android/ion/ion_heap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/android/ion/ion_heap.c b/drivers/staging/android/ion/ion_heap.c index 473b465724f1..0755b11348ed 100644 --- a/drivers/staging/android/ion/ion_heap.c +++ b/drivers/staging/android/ion/ion_heap.c @@ -99,12 +99,12 @@ int ion_heap_map_user(struct ion_heap *heap, struct ion_buffer *buffer, static int ion_heap_clear_pages(struct page **pages, int num, pgprot_t pgprot) { - void *addr = vm_map_ram(pages, num, -1, pgprot); + void *addr = vmap(pages, num, VM_MAP, pgprot); if (!addr) return -ENOMEM; memset(addr, 0, PAGE_SIZE * num); - vm_unmap_ram(addr, num); + vunmap(addr); return 0; } -- cgit v1.2.3 From f8092aa1752749b6d8b357050e840f76695e2361 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:28 -0700 Subject: staging: media: ipu3: use vmap instead of reimplementing it Just use vmap instead of messing with vmalloc internals. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-5-hch@lst.de Signed-off-by: Linus Torvalds --- drivers/staging/media/ipu3/ipu3-css-pool.h | 4 +--- drivers/staging/media/ipu3/ipu3-dmamap.c | 30 ++++++++---------------------- 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/drivers/staging/media/ipu3/ipu3-css-pool.h b/drivers/staging/media/ipu3/ipu3-css-pool.h index f4a60b41401b..a8ccd4f70320 100644 --- a/drivers/staging/media/ipu3/ipu3-css-pool.h +++ b/drivers/staging/media/ipu3/ipu3-css-pool.h @@ -15,14 +15,12 @@ struct imgu_device; * @size: size of the buffer in bytes. * @vaddr: kernel virtual address. * @daddr: iova dma address to access IPU3. - * @vma: private, a pointer to &struct vm_struct, - * used for imgu_dmamap_free. 
*/ struct imgu_css_map { size_t size; void *vaddr; dma_addr_t daddr; - struct vm_struct *vma; + struct page **pages; }; /** diff --git a/drivers/staging/media/ipu3/ipu3-dmamap.c b/drivers/staging/media/ipu3/ipu3-dmamap.c index 7431322379f6..8a19b0024152 100644 --- a/drivers/staging/media/ipu3/ipu3-dmamap.c +++ b/drivers/staging/media/ipu3/ipu3-dmamap.c @@ -96,6 +96,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map, unsigned long shift = iova_shift(&imgu->iova_domain); struct device *dev = &imgu->pci_dev->dev; size_t size = PAGE_ALIGN(len); + int count = size >> PAGE_SHIFT; struct page **pages; dma_addr_t iovaddr; struct iova *iova; @@ -114,7 +115,7 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map, /* Call IOMMU driver to setup pgt */ iovaddr = iova_dma_addr(&imgu->iova_domain, iova); - for (i = 0; i < size / PAGE_SIZE; ++i) { + for (i = 0; i < count; ++i) { rval = imgu_mmu_map(imgu->mmu, iovaddr, page_to_phys(pages[i]), PAGE_SIZE); if (rval) @@ -123,33 +124,23 @@ void *imgu_dmamap_alloc(struct imgu_device *imgu, struct imgu_css_map *map, iovaddr += PAGE_SIZE; } - /* Now grab a virtual region */ - map->vma = __get_vm_area(size, VM_USERMAP, VMALLOC_START, VMALLOC_END); - if (!map->vma) + map->vaddr = vmap(pages, count, VM_USERMAP, PAGE_KERNEL); + if (!map->vaddr) goto out_unmap; - map->vma->pages = pages; - /* And map it in KVA */ - if (map_vm_area(map->vma, PAGE_KERNEL, pages)) - goto out_vunmap; - + map->pages = pages; map->size = size; map->daddr = iova_dma_addr(&imgu->iova_domain, iova); - map->vaddr = map->vma->addr; dev_dbg(dev, "%s: allocated %zu @ IOVA %pad @ VA %p\n", __func__, - size, &map->daddr, map->vma->addr); - - return map->vma->addr; + size, &map->daddr, map->vaddr); -out_vunmap: - vunmap(map->vma->addr); + return map->vaddr; out_unmap: imgu_dmamap_free_buffer(pages, size); imgu_mmu_unmap(imgu->mmu, iova_dma_addr(&imgu->iova_domain, iova), i * PAGE_SIZE); - map->vma = NULL; out_free_iova: 
__free_iova(&imgu->iova_domain, iova); @@ -177,8 +168,6 @@ void imgu_dmamap_unmap(struct imgu_device *imgu, struct imgu_css_map *map) */ void imgu_dmamap_free(struct imgu_device *imgu, struct imgu_css_map *map) { - struct vm_struct *area = map->vma; - dev_dbg(&imgu->pci_dev->dev, "%s: freeing %zu @ IOVA %pad @ VA %p\n", __func__, map->size, &map->daddr, map->vaddr); @@ -187,11 +176,8 @@ void imgu_dmamap_free(struct imgu_device *imgu, struct imgu_css_map *map) imgu_dmamap_unmap(imgu, map); - if (WARN_ON(!area) || WARN_ON(!area->pages)) - return; - - imgu_dmamap_free_buffer(area->pages, map->size); vunmap(map->vaddr); + imgu_dmamap_free_buffer(map->pages, map->size); map->vaddr = NULL; } -- cgit v1.2.3 From 515e5b6d90d410a3b0b433853c367936830a45a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:32 -0700 Subject: dma-mapping: use vmap instead of reimplementing it Replace the open coded instance of vmap with the actual function. In the non-contiguous (IOMMU) case this requires an extra find_vm_area, but given that this isn't a fast path function that is a small price to pay. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y.
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-6-hch@lst.de Signed-off-by: Linus Torvalds --- kernel/dma/remap.c | 48 ++++++++++++------------------------------------ 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index d14cbc83986a..914ff5a58dd5 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -20,23 +20,6 @@ struct page **dma_common_find_pages(void *cpu_addr) return area->pages; } -static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, pgprot_t prot, const void *caller) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, VM_DMA_COHERENT, caller); - if (!area) - return NULL; - - if (map_vm_area(area, prot, pages)) { - vunmap(area->addr); - return NULL; - } - - return area; -} - /* * Remaps an array of PAGE_SIZE pages into another vm_area. 
* Cannot be used in non-sleeping contexts @@ -44,15 +27,12 @@ static struct vm_struct *__dma_common_pages_remap(struct page **pages, void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot, const void *caller) { - struct vm_struct *area; + void *vaddr; - area = __dma_common_pages_remap(pages, size, prot, caller); - if (!area) - return NULL; - - area->pages = pages; - - return area->addr; + vaddr = vmap(pages, size >> PAGE_SHIFT, VM_DMA_COHERENT, prot); + if (vaddr) + find_vm_area(vaddr)->pages = pages; + return vaddr; } /* @@ -62,24 +42,20 @@ void *dma_common_pages_remap(struct page **pages, size_t size, void *dma_common_contiguous_remap(struct page *page, size_t size, pgprot_t prot, const void *caller) { - int i; + int count = size >> PAGE_SHIFT; struct page **pages; - struct vm_struct *area; + void *vaddr; + int i; - pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!pages) return NULL; - - for (i = 0; i < (size >> PAGE_SHIFT); i++) + for (i = 0; i < count; i++) pages[i] = nth_page(page, i); - - area = __dma_common_pages_remap(pages, size, prot, caller); - + vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kfree(pages); - if (!area) - return NULL; - return area->addr; + return vaddr; } /* -- cgit v1.2.3 From b274014c6d191ec92c4921d5e84c90f6ed2f38a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:37 -0700 Subject: powerpc: add an ioremap_phb helper Factor code shared between pci_64 and electra_cf into an ioremap_phb helper that follows the normal ioremap semantics, and returns a useful __iomem pointer. Note that it opencodes __ioremap_at as we know from the callers the slab is available. Switch pci_64 to also store the result as __iomem pointer, and unmap the result using iounmap instead of force casting and using vmalloc APIs.
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-7-hch@lst.de Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/io.h | 2 ++ arch/powerpc/include/asm/pci-bridge.h | 2 +- arch/powerpc/kernel/pci_64.c | 53 +++++++++++++++++++++++------------ drivers/pcmcia/electra_cf.c | 45 +++++++++++------------------ 4 files changed, 54 insertions(+), 48 deletions(-) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 635969b5b58e..91320985d33f 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -719,6 +719,8 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); extern void iounmap(volatile void __iomem *addr); +void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size); + int early_ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, pgprot_t prot); void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 69f4cb3b7c56..b92e81b256e5 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -66,7 +66,7 @@ struct pci_controller { void __iomem *io_base_virt; #ifdef CONFIG_PPC64 - void *io_base_alloc; + void __iomem *io_base_alloc; #endif resource_size_t io_base_phys; resource_size_t pci_io_size; diff --git a/arch/powerpc/kernel/pci_64.c 
b/arch/powerpc/kernel/pci_64.c index f83d1f69b1dd..2a976314f169 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -109,23 +109,46 @@ int pcibios_unmap_io_space(struct pci_bus *bus) /* Get the host bridge */ hose = pci_bus_to_host(bus); - /* Check if we have IOs allocated */ - if (hose->io_base_alloc == NULL) - return 0; - pr_debug("IO unmapping for PHB %pOF\n", hose->dn); pr_debug(" alloc=0x%p\n", hose->io_base_alloc); - /* This is a PHB, we fully unmap the IO area */ - vunmap(hose->io_base_alloc); - + iounmap(hose->io_base_alloc); return 0; } EXPORT_SYMBOL_GPL(pcibios_unmap_io_space); -static int pcibios_map_phb_io_space(struct pci_controller *hose) +void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size) { struct vm_struct *area; + unsigned long addr; + + WARN_ON_ONCE(paddr & ~PAGE_MASK); + WARN_ON_ONCE(size & ~PAGE_MASK); + + /* + * Let's allocate some IO space for that guy. We don't pass VM_IOREMAP + * because we don't care about alignment tricks that the core does in + * that case. Maybe we should due to stupid card with incomplete + * address decoding but I'd rather not deal with those outside of the + * reserved 64K legacy region. + */ + area = __get_vm_area(size, 0, PHB_IO_BASE, PHB_IO_END); + if (!area) + return NULL; + + addr = (unsigned long)area->addr; + if (ioremap_page_range(addr, addr + size, paddr, + pgprot_noncached(PAGE_KERNEL))) { + unmap_kernel_range(addr, size); + return NULL; + } + + return (void __iomem *)addr; +} +EXPORT_SYMBOL_GPL(ioremap_phb); + +static int pcibios_map_phb_io_space(struct pci_controller *hose) +{ unsigned long phys_page; unsigned long size_page; unsigned long io_virt_offset; @@ -146,12 +169,11 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) * with incomplete address decoding but I'd rather not deal with * those outside of the reserved 64K legacy region. 
*/ - area = __get_vm_area(size_page, 0, PHB_IO_BASE, PHB_IO_END); - if (area == NULL) + hose->io_base_alloc = ioremap_phb(phys_page, size_page); + if (!hose->io_base_alloc) return -ENOMEM; - hose->io_base_alloc = area->addr; - hose->io_base_virt = (void __iomem *)(area->addr + - hose->io_base_phys - phys_page); + hose->io_base_virt = hose->io_base_alloc + + hose->io_base_phys - phys_page; pr_debug("IO mapping for PHB %pOF\n", hose->dn); pr_debug(" phys=0x%016llx, virt=0x%p (alloc=0x%p)\n", @@ -159,11 +181,6 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose) pr_debug(" size=0x%016llx (alloc=0x%016lx)\n", hose->pci_io_size, size_page); - /* Establish the mapping */ - if (__ioremap_at(phys_page, area->addr, size_page, - pgprot_noncached(PAGE_KERNEL)) == NULL) - return -ENOMEM; - /* Fixup hose IO resource */ io_virt_offset = pcibios_io_space_offset(hose); hose->io_resource.start += io_virt_offset; diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c index f2741c04289d..35158cfd9c1a 100644 --- a/drivers/pcmcia/electra_cf.c +++ b/drivers/pcmcia/electra_cf.c @@ -178,10 +178,9 @@ static int electra_cf_probe(struct platform_device *ofdev) struct device_node *np = ofdev->dev.of_node; struct electra_cf_socket *cf; struct resource mem, io; - int status; + int status = -ENOMEM; const unsigned int *prop; int err; - struct vm_struct *area; err = of_address_to_resource(np, 0, &mem); if (err) @@ -202,30 +201,19 @@ static int electra_cf_probe(struct platform_device *ofdev) cf->mem_phys = mem.start; cf->mem_size = PAGE_ALIGN(resource_size(&mem)); cf->mem_base = ioremap(cf->mem_phys, cf->mem_size); + if (!cf->mem_base) + goto out_free_cf; cf->io_size = PAGE_ALIGN(resource_size(&io)); - - area = __get_vm_area(cf->io_size, 0, PHB_IO_BASE, PHB_IO_END); - if (area == NULL) { - status = -ENOMEM; - goto fail1; - } - - cf->io_virt = (void __iomem *)(area->addr); + cf->io_virt = ioremap_phb(io.start, cf->io_size); + if (!cf->io_virt) + goto out_unmap_mem; 
cf->gpio_base = ioremap(0xfc103000, 0x1000); + if (!cf->gpio_base) + goto out_unmap_virt; dev_set_drvdata(device, cf); - if (!cf->mem_base || !cf->io_virt || !cf->gpio_base || - (__ioremap_at(io.start, cf->io_virt, cf->io_size, - pgprot_noncached(PAGE_KERNEL)) == NULL)) { - dev_err(device, "can't ioremap ranges\n"); - status = -ENOMEM; - goto fail1; - } - - cf->io_base = (unsigned long)cf->io_virt - VMALLOC_END; - cf->iomem.start = (unsigned long)cf->mem_base; cf->iomem.end = (unsigned long)cf->mem_base + (mem.end - mem.start); cf->iomem.flags = IORESOURCE_MEM; @@ -305,14 +293,13 @@ fail1: if (cf->irq) free_irq(cf->irq, cf); - if (cf->io_virt) - __iounmap_at(cf->io_virt, cf->io_size); - if (cf->mem_base) - iounmap(cf->mem_base); - if (cf->gpio_base) - iounmap(cf->gpio_base); - if (area) - device_init_wakeup(&ofdev->dev, 0); + iounmap(cf->gpio_base); +out_unmap_virt: + device_init_wakeup(&ofdev->dev, 0); + iounmap(cf->io_virt); +out_unmap_mem: + iounmap(cf->mem_base); +out_free_cf: kfree(cf); return status; @@ -330,7 +317,7 @@ static int electra_cf_remove(struct platform_device *ofdev) free_irq(cf->irq, cf); del_timer_sync(&cf->timer); - __iounmap_at(cf->io_virt, cf->io_size); + iounmap(cf->io_virt); iounmap(cf->mem_base); iounmap(cf->gpio_base); release_mem_region(cf->mem_phys, cf->mem_size); -- cgit v1.2.3 From 91f03f297c46bd2ced1e73fc9a668292145b1135 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:41 -0700 Subject: powerpc: remove __ioremap_at and __iounmap_at These helpers are only used for remapping the ISA I/O base. Replace the mapping side with a remap_isa_base helper in isa-bridge.c that hard codes all the known arguments, and just remove __iounmap_at in favour of open coding it in the only caller.
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-8-hch@lst.de Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/io.h | 8 ------- arch/powerpc/kernel/isa-bridge.c | 28 ++++++++++++++++------ arch/powerpc/mm/ioremap_64.c | 50 ---------------------------------------- 3 files changed, 21 insertions(+), 65 deletions(-) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 91320985d33f..13f90dd03450 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -699,10 +699,6 @@ static inline void iosync(void) * * * iounmap undoes such a mapping and can be hooked * - * * __ioremap_at (and the pending __iounmap_at) are low level functions to - * create hand-made mappings for use only by the PCI code and cannot - * currently be hooked. Must be page aligned. 
- * * * __ioremap_caller is the same as above but takes an explicit caller * reference rather than using __builtin_return_address(0) * @@ -729,10 +725,6 @@ void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, pgprot_t prot, void *caller); -extern void __iomem * __ioremap_at(phys_addr_t pa, void *ea, - unsigned long size, pgprot_t prot); -extern void __iounmap_at(void *ea, unsigned long size); - /* * When CONFIG_PPC_INDIRECT_PIO is set, we use the generic iomap implementation * which needs some additional definitions here. They basically allow PIO diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index 773671b512df..2257d24e6a26 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -38,6 +39,22 @@ EXPORT_SYMBOL_GPL(isa_bridge_pcidev); #define ISA_SPACE_MASK 0x1 #define ISA_SPACE_IO 0x1 +static void remap_isa_base(phys_addr_t pa, unsigned long size) +{ + WARN_ON_ONCE(ISA_IO_BASE & ~PAGE_MASK); + WARN_ON_ONCE(pa & ~PAGE_MASK); + WARN_ON_ONCE(size & ~PAGE_MASK); + + if (slab_is_available()) { + if (ioremap_page_range(ISA_IO_BASE, ISA_IO_BASE + size, pa, + pgprot_noncached(PAGE_KERNEL))) + unmap_kernel_range(ISA_IO_BASE, size); + } else { + early_ioremap_range(ISA_IO_BASE, pa, size, + pgprot_noncached(PAGE_KERNEL)); + } +} + static void pci_process_ISA_OF_ranges(struct device_node *isa_node, unsigned long phb_io_base_phys) { @@ -105,15 +122,13 @@ static void pci_process_ISA_OF_ranges(struct device_node *isa_node, if (size > 0x10000) size = 0x10000; - __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - size, pgprot_noncached(PAGE_KERNEL)); + remap_isa_base(phb_io_base_phys, size); return; inval_range: printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " "mapping 64k\n"); - __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, - 
0x10000, pgprot_noncached(PAGE_KERNEL)); + remap_isa_base(phb_io_base_phys, 0x10000); } @@ -248,8 +263,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np) * and map it */ isa_io_base = ISA_IO_BASE; - __ioremap_at(pbase, (void *)ISA_IO_BASE, - size, pgprot_noncached(PAGE_KERNEL)); + remap_isa_base(pbase, size); pr_debug("ISA: Non-PCI bridge is %pOF\n", np); } @@ -297,7 +311,7 @@ static void isa_bridge_remove(void) isa_bridge_pcidev = NULL; /* Unmap the ISA area */ - __iounmap_at((void *)ISA_IO_BASE, 0x10000); + unmap_kernel_range(ISA_IO_BASE, 0x10000); } /** diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index 50a99d9684f7..ba5cbb0d66bd 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -4,56 +4,6 @@ #include #include -/** - * Low level function to establish the page tables for an IO mapping - */ -void __iomem *__ioremap_at(phys_addr_t pa, void *ea, unsigned long size, pgprot_t prot) -{ - int ret; - unsigned long va = (unsigned long)ea; - - /* We don't support the 4K PFN hack with ioremap */ - if (pgprot_val(prot) & H_PAGE_4K_PFN) - return NULL; - - if ((ea + size) >= (void *)IOREMAP_END) { - pr_warn("Outside the supported range\n"); - return NULL; - } - - WARN_ON(pa & ~PAGE_MASK); - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - if (slab_is_available()) { - ret = ioremap_page_range(va, va + size, pa, prot); - if (ret) - unmap_kernel_range(va, size); - } else { - ret = early_ioremap_range(va, pa, size, prot); - } - - if (ret) - return NULL; - - return (void __iomem *)ea; -} -EXPORT_SYMBOL(__ioremap_at); - -/** - * Low level function to tear down the page tables for an IO mapping. This is - * used for mappings that are manipulated manually, like partial unmapping of - * PCI IOs or ISA space. 
- */ -void __iounmap_at(void *ea, unsigned long size) -{ - WARN_ON(((unsigned long)ea) & ~PAGE_MASK); - WARN_ON(size & ~PAGE_MASK); - - unmap_kernel_range((unsigned long)ea, size); -} -EXPORT_SYMBOL(__iounmap_at); - void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *caller) { -- cgit v1.2.3 From 4926627793c0a7e7db2bc674e1d06777e86d8dab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:45 -0700 Subject: mm: remove __get_vm_area Switch the two remaining callers to use __get_vm_area_caller instead. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-9-hch@lst.de Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/pci_64.c | 3 ++- arch/sh/kernel/cpu/sh4/sq.c | 3 ++- include/linux/vmalloc.h | 2 -- mm/vmalloc.c | 8 -------- 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 2a976314f169..d9ac980c398c 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -132,7 +132,8 @@ void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size) * address decoding but I'd rather not deal with those outside of the * reserved 64K legacy region. 
*/ - area = __get_vm_area(size, 0, PHB_IO_BASE, PHB_IO_END); + area = __get_vm_area_caller(size, 0, PHB_IO_BASE, PHB_IO_END, + __builtin_return_address(0)); if (!area) return NULL; diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c index 934ff84844fa..d432164b23b7 100644 --- a/arch/sh/kernel/cpu/sh4/sq.c +++ b/arch/sh/kernel/cpu/sh4/sq.c @@ -103,7 +103,8 @@ static int __sq_remap(struct sq_mapping *map, pgprot_t prot) #if defined(CONFIG_MMU) struct vm_struct *vma; - vma = __get_vm_area(map->size, VM_ALLOC, map->sq_addr, SQ_ADDRMAX); + vma = __get_vm_area_caller(map->size, VM_ALLOC, map->sq_addr, + SQ_ADDRMAX, __builtin_return_address(0)); if (!vma) return -ENOMEM; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index a95d3cc74d79..ff69d1e037ca 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -161,8 +161,6 @@ static inline size_t get_vm_area_size(const struct vm_struct *area) extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller); -extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end); extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9a8227afa073..6e9527f131d1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2128,14 +2128,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return area; } -struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end) -{ - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, - GFP_KERNEL, __builtin_return_address(0)); -} -EXPORT_SYMBOL_GPL(__get_vm_area); - struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const 
void *caller) -- cgit v1.2.3 From 8f87cc9386dc7965de151605637eee939ea0d098 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:49 -0700 Subject: mm: unexport unmap_kernel_range_noflush There are no modular users of this function. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-10-hch@lst.de Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6e9527f131d1..af60c43c643e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2030,7 +2030,6 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { vunmap_page_range(addr, addr + size); } -EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); /** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB -- cgit v1.2.3 From 8b136018da7bf49b988a24064fc45c290baffd93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:53 -0700 Subject: mm: rename CONFIG_PGTABLE_MAPPING to CONFIG_ZSMALLOC_PGTABLE_MAPPING Rename the Kconfig variable to clarify the scope. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Minchan Kim Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-11-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm/configs/omap2plus_defconfig | 2 +- include/linux/zsmalloc.h | 2 +- mm/Kconfig | 2 +- mm/zsmalloc.c | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 8b83d4a5d309..fe383f5a92fb 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -81,7 +81,7 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=y CONFIG_CMA=y CONFIG_ZSMALLOC=m -CONFIG_PGTABLE_MAPPING=y +CONFIG_ZSMALLOC_PGTABLE_MAPPING=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 2219cce81ca4..0fdbf653b173 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -20,7 +20,7 @@ * zsmalloc mapping modes * * NOTE: These only make a difference when a mapped object spans pages. - * They also have no effect when PGTABLE_MAPPING is selected. + * They also have no effect when ZSMALLOC_PGTABLE_MAPPING is selected. */ enum zs_mapmode { ZS_MM_RW, /* normal read-write mapping */ diff --git a/mm/Kconfig b/mm/Kconfig index c1acc34c1c35..09a9edfb8461 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -705,7 +705,7 @@ config ZSMALLOC returned by an alloc(). This handle must be mapped in order to access the allocated space. 
-config PGTABLE_MAPPING +config ZSMALLOC_PGTABLE_MAPPING bool "Use page table mapping to access object in zsmalloc" depends on ZSMALLOC help diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2f836a2b993f..ac0524330b9b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -293,7 +293,7 @@ struct zspage { }; struct mapping_area { -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING struct vm_struct *vm; /* vm area for mapping object that span pages */ #else char *vm_buf; /* copy buffer for objects that span pages */ @@ -1113,7 +1113,7 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -#ifdef CONFIG_PGTABLE_MAPPING +#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING static inline int __zs_cpu_up(struct mapping_area *area) { /* @@ -1151,7 +1151,7 @@ static inline void __zs_unmap_object(struct mapping_area *area, unmap_kernel_range(addr, PAGE_SIZE * 2); } -#else /* CONFIG_PGTABLE_MAPPING */ +#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static inline int __zs_cpu_up(struct mapping_area *area) { @@ -1233,7 +1233,7 @@ out: pagefault_enable(); } -#endif /* CONFIG_PGTABLE_MAPPING */ +#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ static int zs_cpu_prepare(unsigned int cpu) { -- cgit v1.2.3 From b607e6d17db5b91e6a807b4f9a2e849219d720a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:58 -0700 Subject: mm: only allow page table mappings for built-in zsmalloc This allows us to unexport map_vm_area and unmap_kernel_range, which are rather deep internals and should not be available to modules, as they for example allow fine grained control of mapping permissions, and also allow splitting the setup of a vmalloc area and the actual mapping and thus expose vmalloc internals. zsmalloc is typically built-in and continues to work (just like the percpu-vm code using a similar pattern), while modular zsmalloc also continues to work, but must use copies.
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-12-hch@lst.de Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- mm/vmalloc.c | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 09a9edfb8461..5c0362bd8d56 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -707,7 +707,7 @@ config ZSMALLOC config ZSMALLOC_PGTABLE_MAPPING bool "Use page table mapping to access object in zsmalloc" - depends on ZSMALLOC + depends on ZSMALLOC=y help By default, zsmalloc uses a copy-based object mapping method to access allocations that span two pages. However, if a particular diff --git a/mm/vmalloc.c b/mm/vmalloc.c index af60c43c643e..d4c13229ddc2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2047,7 +2047,6 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } -EXPORT_SYMBOL_GPL(unmap_kernel_range); int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) { @@ -2059,7 +2058,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) return err > 0 ? 
0 : err; } -EXPORT_SYMBOL_GPL(map_vm_area); static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) -- cgit v1.2.3 From 78a0e8c4837f42e9c2b1127e9c450ceeb0efbde6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:02 -0700 Subject: mm: pass addr as unsigned long to vb_free Every use of addr in vb_free casts to unsigned long first, and the caller has an unsigned long version of the address available anyway. Just pass that and avoid all the casts. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-13-hch@lst.de Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d4c13229ddc2..edb95da697a6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1665,7 +1665,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) return vaddr; } -static void vb_free(const void *addr, unsigned long size) +static void vb_free(unsigned long addr, unsigned long size) { unsigned long offset; unsigned long vb_idx; @@ -1675,24 +1675,22 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); - flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); + flush_cache_vunmap(addr, addr + size); order = get_order(size); - offset = (unsigned long)addr &
(VMAP_BLOCK_SIZE - 1); - offset >>= PAGE_SHIFT; + offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; - vb_idx = addr_to_vb_idx((unsigned long)addr); + vb_idx = addr_to_vb_idx(addr); rcu_read_lock(); vb = radix_tree_lookup(&vmap_block_tree, vb_idx); rcu_read_unlock(); BUG_ON(!vb); - vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + vunmap_page_range(addr, addr + size); if (debug_pagealloc_enabled_static()) - flush_tlb_kernel_range((unsigned long)addr, - (unsigned long)addr + size); + flush_tlb_kernel_range(addr, addr + size); spin_lock(&vb->lock); @@ -1792,7 +1790,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); - vb_free(mem, size); + vb_free(addr, size); return; } -- cgit v1.2.3 From b521c43f58e5234ee9b29817ed5e93523abcffa9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:07 -0700 Subject: mm: remove vmap_page_range_noflush and vunmap_page_range These have non-static aliases called map_kernel_range_noflush and unmap_kernel_range_noflush that just differ slightly in the calling conventions that pass addr + size instead of an end. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-14-hch@lst.de Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 98 +++++++++++++++++++++++++----------------------------------- 1 file changed, 40 insertions(+), 58 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index edb95da697a6..e9970849a103 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -128,10 +128,24 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) } while (p4d++, addr = next, addr != end); } -static void vunmap_page_range(unsigned long addr, unsigned long end) +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @addr: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify + * should have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible + * for calling flush_cache_vunmap() on to-be-mapped areas before calling this + * function and flush_tlb_kernel_range() after. + */ +void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { - pgd_t *pgd; + unsigned long end = addr + size; unsigned long next; + pgd_t *pgd; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); @@ -220,18 +234,30 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, return 0; } -/* - * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and - * will have pfns corresponding to the "pages" array. 
+/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map * - * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should + * have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible for + * calling flush_cache_vmap() on to-be-mapped areas before calling this + * function. + * + * RETURNS: + * The number of pages mapped on success, -errno on failure. */ -static int vmap_page_range_noflush(unsigned long start, unsigned long end, - pgprot_t prot, struct page **pages) +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) { - pgd_t *pgd; + unsigned long end = addr + size; unsigned long next; - unsigned long addr = start; + pgd_t *pgd; int err = 0; int nr = 0; @@ -252,7 +278,7 @@ static int vmap_page_range(unsigned long start, unsigned long end, { int ret; - ret = vmap_page_range_noflush(start, end, prot, pages); + ret = map_kernel_range_noflush(start, end - start, prot, pages); flush_cache_vmap(start, end); return ret; } @@ -1227,7 +1253,7 @@ EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); */ static void unmap_vmap_area(struct vmap_area *va) { - vunmap_page_range(va->va_start, va->va_end); + unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); } /* @@ -1687,7 +1713,7 @@ static void vb_free(unsigned long addr, unsigned long size) rcu_read_unlock(); BUG_ON(!vb); - vunmap_page_range(addr, addr + size); + unmap_kernel_range_noflush(addr, size); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(addr, addr + size); @@ -1985,50 +2011,6 @@ void __init vmalloc_init(void) vmap_initialized = true; } -/** - * map_kernel_range_noflush - map kernel VM area 
with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vmap() on to-be-mapped areas - * before calling this function. - * - * RETURNS: - * The number of pages mapped on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) -{ - return vmap_page_range_noflush(addr, addr + size, prot, pages); -} - -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @addr: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size - * specify should have been allocated using get_vm_area() and its - * friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is - * responsible for calling flush_cache_vunmap() on to-be-mapped areas - * before calling this function and flush_tlb_kernel_range() after. 
- */ -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) -{ - vunmap_page_range(addr, addr + size); -} - /** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB * @addr: start of the VM area to unmap @@ -2042,7 +2024,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) unsigned long end = addr + size; flush_cache_vunmap(addr, end); - vunmap_page_range(addr, end); + unmap_kernel_range_noflush(addr, size); flush_tlb_kernel_range(addr, end); } -- cgit v1.2.3 From a29adb6209cead1f6c34a8d72481fb183bfc2d68 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:11 -0700 Subject: mm: rename vmap_page_range to map_kernel_range This matches the map_kernel_range_noflush API. Also change to pass a size instead of the end, similar to the noflush version. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-15-hch@lst.de Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e9970849a103..fe8b7aa33094 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -273,13 +273,13 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return nr; } -static int vmap_page_range(unsigned long start, unsigned long end, +static int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages) { int ret; - ret = map_kernel_range_noflush(start, end - start, prot, pages); - flush_cache_vmap(start, end); + ret = map_kernel_range_noflush(start, size, prot, pages); + flush_cache_vmap(start, start + size); return ret; } @@ -1867,7 +1867,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro kasan_unpoison_vmalloc(mem, size); - if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + if (map_kernel_range(addr, size, prot, pages) < 0) { vm_unmap_ram(mem, count); return NULL; } @@ -2031,10 +2031,9 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) { unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + get_vm_area_size(area); int err; - err = vmap_page_range(addr, end, prot, pages); + err = map_kernel_range(addr, get_vm_area_size(area), prot, pages); return err > 0 ? 
0 : err; } -- cgit v1.2.3 From 60bb44652a0dcc44acfc2ed8ebb35e4a389e5421 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:15 -0700 Subject: mm: don't return the number of pages from map_kernel_range{,_noflush} None of the callers needs the number of pages, and a 0 / -errno return value is a lot more intuitive. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-16-hch@lst.de Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fe8b7aa33094..aab00ddee686 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -250,7 +250,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, * function. * * RETURNS: - * The number of pages mapped on success, -errno on failure. + * 0 on success, -errno on failure. 
*/ int map_kernel_range_noflush(unsigned long addr, unsigned long size, pgprot_t prot, struct page **pages) @@ -270,7 +270,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return err; } while (pgd++, addr = next, addr != end); - return nr; + return 0; } static int map_kernel_range(unsigned long start, unsigned long size, -- cgit v1.2.3 From ed1f324c5fed06c91f30a36aedb66f34244ab86e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:19 -0700 Subject: mm: remove map_vm_range Switch all callers to map_kernel_range, which is symmetric to the unmap side (as well as the _noflush versions). Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-17-hch@lst.de Signed-off-by: Linus Torvalds --- Documentation/core-api/cachetlb.rst | 2 +- include/linux/vmalloc.h | 10 ++++------ mm/vmalloc.c | 21 +++++++-------------- mm/zsmalloc.c | 4 +++- net/ceph/ceph_common.c | 3 +-- 5 files changed, 16 insertions(+), 24 deletions(-) diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index 93cb65d52720..a1582cc79f0f 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -213,7 +213,7 @@ Here are the routines, one by one: there will be no entries in the cache for the kernel address space for virtual addresses in the range 'start' to 'end-1'.
- The first of these two routines is invoked after map_vm_area() + The first of these two routines is invoked after map_kernel_range() has installed the page table entries. The second is invoked before unmap_kernel_range() deletes the page table entries. diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index ff69d1e037ca..a1e9bdc3ad9e 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -168,11 +168,11 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); -extern int map_vm_area(struct vm_struct *area, pgprot_t prot, - struct page **pages); #ifdef CONFIG_MMU extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); static inline void set_vm_flush_reset_perms(void *addr) @@ -189,14 +189,12 @@ map_kernel_range_noflush(unsigned long start, unsigned long size, { return size >> PAGE_SHIFT; } +#define map_kernel_range map_kernel_range_noflush static inline void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { } -static inline void -unmap_kernel_range(unsigned long addr, unsigned long size) -{ -} +#define unmap_kernel_range unmap_kernel_range_noflush static inline void set_vm_flush_reset_perms(void *addr) { } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index aab00ddee686..49ca687d8853 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -273,8 +273,8 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return 0; } -static int map_kernel_range(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages) +int map_kernel_range(unsigned long start, unsigned long 
size, pgprot_t prot, + struct page **pages) { int ret; @@ -2028,16 +2028,6 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) flush_tlb_kernel_range(addr, end); } -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) -{ - unsigned long addr = (unsigned long)area->addr; - int err; - - err = map_kernel_range(addr, get_vm_area_size(area), prot, pages); - - return err > 0 ? 0 : err; -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2409,7 +2399,8 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, pages)) { + if (map_kernel_range((unsigned long)area->addr, size, prot, + pages) < 0) { vunmap(area->addr); return NULL; } @@ -2472,8 +2463,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_vm_area(area, prot, pages)) + if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), + prot, pages) < 0) goto fail; + return area->addr; fail: diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index ac0524330b9b..f6dc0673e62c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1138,7 +1138,9 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); + unsigned long addr = (unsigned long)area->vm->addr; + + BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); area->vm_addr = area->vm->addr; return area->vm_addr + off; } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index a0e97f6c1072..66f22e8aa529 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -190,8 +190,7 @@ EXPORT_SYMBOL(ceph_compare_options); * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are * compatible with (a 
superset of) GFP_KERNEL. This is because while the * actual pages are allocated with the specified flags, the page table pages - * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take - * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). + * are always allocated with GFP_KERNEL. * * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. */ -- cgit v1.2.3 From 855e57a11966722c67e631ae530c03e251fbcf8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:23 -0700 Subject: mm: remove unmap_vmap_area This function just has a single caller, open code it there. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-18-hch@lst.de Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 49ca687d8853..3b9b1366baa3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1248,14 +1248,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); -/* - * Clear the pagetable entries of a given vmap_area - */ -static void unmap_vmap_area(struct vmap_area *va) -{ - unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); -} - /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. 
@@ -1417,7 +1409,7 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - unmap_vmap_area(va); + unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); -- cgit v1.2.3 From d4efd79a81abc7096a418ee3103f261cfb6ab634 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:27 -0700 Subject: mm: remove the prot argument from vm_map_ram This is always PAGE_KERNEL - for long term mappings with other properties vmap should be used. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-19-hch@lst.de Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c | 2 +- drivers/media/common/videobuf2/videobuf2-dma-sg.c | 3 +-- drivers/media/common/videobuf2/videobuf2-vmalloc.c | 3 +-- fs/erofs/decompressor.c | 2 +- fs/xfs/xfs_buf.c | 2 +- include/linux/vmalloc.h | 3 +-- mm/nommu.c | 2 +- mm/vmalloc.c | 4 ++-- 8 files changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c index 9272bef57092..debaf7b18ab5 100644 --- a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c @@ -66,7 +66,7 @@ static void *mock_dmabuf_vmap(struct 
dma_buf *dma_buf) { struct mock_dmabuf *mock = to_mock(dma_buf); - return vm_map_ram(mock->pages, mock->npages, 0, PAGE_KERNEL); + return vm_map_ram(mock->pages, mock->npages, 0); } static void mock_dmabuf_vunmap(struct dma_buf *dma_buf, void *vaddr) diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c index 6db60e9d5183..92072a08af25 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c @@ -309,8 +309,7 @@ static void *vb2_dma_sg_vaddr(void *buf_priv) if (buf->db_attach) buf->vaddr = dma_buf_vmap(buf->db_attach->dmabuf); else - buf->vaddr = vm_map_ram(buf->pages, - buf->num_pages, -1, PAGE_KERNEL); + buf->vaddr = vm_map_ram(buf->pages, buf->num_pages, -1); } /* add offset in case userptr is not page-aligned */ diff --git a/drivers/media/common/videobuf2/videobuf2-vmalloc.c b/drivers/media/common/videobuf2/videobuf2-vmalloc.c index 1a4f0ca87c7c..c66fda4a65e4 100644 --- a/drivers/media/common/videobuf2/videobuf2-vmalloc.c +++ b/drivers/media/common/videobuf2/videobuf2-vmalloc.c @@ -107,8 +107,7 @@ static void *vb2_vmalloc_get_userptr(struct device *dev, unsigned long vaddr, buf->vaddr = (__force void *) ioremap(__pfn_to_phys(nums[0]), size + offset); } else { - buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1, - PAGE_KERNEL); + buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1); } if (!buf->vaddr) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 5d2d81940679..7628816f2453 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -274,7 +274,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, i = 0; while (1) { - dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL); + dst = vm_map_ram(rq->out, nrpages_out, -1); /* retry two more times (totally 3 times) */ if (dst || ++i >= 3) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9ec3eaf1c618..65538d18e64f 
100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -477,7 +477,7 @@ _xfs_buf_map_pages( nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + -1); if (bp->b_addr) break; vm_unmap_aliases(); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index a1e9bdc3ad9e..5488cea5ef11 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -88,8 +88,7 @@ struct vmap_area { * Highlevel APIs for driver use */ extern void vm_unmap_ram(const void *mem, unsigned int count); -extern void *vm_map_ram(struct page **pages, unsigned int count, - int node, pgprot_t prot); +extern void *vm_map_ram(struct page **pages, unsigned int count, int node); extern void vm_unmap_aliases(void); #ifdef CONFIG_MMU diff --git a/mm/nommu.c b/mm/nommu.c index 318df4e236c9..4f07b7ef0297 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -351,7 +351,7 @@ void vunmap(const void *addr) } EXPORT_SYMBOL(vunmap); -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { BUG(); return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3b9b1366baa3..9848156a1c6a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1835,7 +1835,7 @@ EXPORT_SYMBOL(vm_unmap_ram); * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ -void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +void *vm_map_ram(struct page **pages, unsigned int count, int node) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; @@ -1859,7 +1859,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro kasan_unpoison_vmalloc(mem, size); - if (map_kernel_range(addr, size, prot, pages) < 0) { + if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) { vm_unmap_ram(mem, count); return NULL; } -- cgit v1.2.3 From cca98e9f8b5ebcd9640846a675172578249b11a0 Mon Sep 17 
00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:32 -0700 Subject: mm: enforce that vmap can't map pages executable To help enforce the W^X protection, don't allow remapping existing pages as executable. x86 bits from Peter Zijlstra, arm64 bits from Mark Rutland. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Mark Rutland Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-20-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/pgtable.h | 3 +++ arch/x86/include/asm/pgtable_types.h | 6 ++++++ include/asm-generic/pgtable.h | 4 ++++ mm/vmalloc.c | 2 +- 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 538c85e62f86..47095216d6a8 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -407,6 +407,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define __pgprot_modify(prot,mask,bits) \ __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) +#define pgprot_nx(prot) \ + __pgprot_modify(prot, 0, PTE_PXN) + /* * Mark the prot value as uncacheable and unbufferable.
*/ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 947867f112ea..2e7c442cc618 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -282,6 +282,12 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; +static inline pgprot_t pgprot_nx(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | _PAGE_NX); +} +#define pgprot_nx pgprot_nx + #ifdef CONFIG_X86_PAE /* diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 329b8c8ca703..8c5f9c29698b 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -491,6 +491,10 @@ static inline int arch_unmap_one(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif +#ifndef pgprot_nx +#define pgprot_nx(prot) (prot) +#endif + #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9848156a1c6a..4d7c7108181a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2391,7 +2391,7 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_kernel_range((unsigned long)area->addr, size, prot, + if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), pages) < 0) { vunmap(area->addr); return NULL; -- cgit v1.2.3 From d28ff991b2e23049c30584c1d97681087193879f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:36 -0700 Subject: gpu/drm: remove the powerpc hack in drm_legacy_sg_alloc The non-cached vmalloc mapping was initially added as a hack for the first-gen amigaone platform (6xx/book32s), isn't fully supported upstream, and which used the legacy radeon driver together with non-coherent DMA. However this only ever worked reliably for DRI . 
Remove the hack as it is the last user of __vmalloc passing a page protection flag other than PAGE_KERNEL and didn't do anything for other platforms with non-coherent DMA. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Daniel Vetter Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-21-hch@lst.de Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_scatter.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/gpu/drm/drm_scatter.c b/drivers/gpu/drm/drm_scatter.c index ca520028b2cb..f4e6184d1877 100644 --- a/drivers/gpu/drm/drm_scatter.c +++ b/drivers/gpu/drm/drm_scatter.c @@ -43,15 +43,6 @@ #define DEBUG_SCATTER 0 -static inline void *drm_vmalloc_dma(unsigned long size) -{ -#if defined(__powerpc__) && defined(CONFIG_NOT_COHERENT_CACHE) - return __vmalloc(size, GFP_KERNEL, pgprot_noncached_wc(PAGE_KERNEL)); -#else - return vmalloc_32(size); -#endif -} - static void drm_sg_cleanup(struct drm_sg_mem * entry) { struct page *page; @@ -126,7 +117,7 @@ int drm_legacy_sg_alloc(struct drm_device *dev, void *data, return -ENOMEM; } - entry->virtual = drm_vmalloc_dma(pages << PAGE_SHIFT); + entry->virtual = vmalloc_32(pages << PAGE_SHIFT); if (!entry->virtual) { kfree(entry->busaddr); kfree(entry->pagelist); -- cgit v1.2.3 From 88dca4ca5a93d2c09e5bbc6a62fbfc3af83c4fca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:40 -0700 Subject: mm: remove the pgprot argument to __vmalloc The pgprot argument to __vmalloc is 
always PAGE_KERNEL now, so remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Michael Kelley [hyperv] Acked-by: Gao Xiang [erofs] Acked-by: Peter Zijlstra (Intel) Acked-by: Wei Liu Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-22-hch@lst.de Signed-off-by: Linus Torvalds --- arch/x86/hyperv/hv_init.c | 3 +-- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/svm/sev.c | 3 +-- drivers/block/drbd/drbd_bitmap.c | 4 +--- drivers/gpu/drm/etnaviv/etnaviv_dump.c | 4 ++-- drivers/lightnvm/pblk-init.c | 5 ++--- drivers/md/dm-bufio.c | 4 ++-- drivers/mtd/ubi/io.c | 4 ++-- drivers/scsi/sd_zbc.c | 3 +-- fs/gfs2/dir.c | 9 ++++----- fs/gfs2/quota.c | 2 +- fs/nfs/blocklayout/extent_tree.c | 2 +- fs/ntfs/malloc.h | 2 +- fs/ubifs/debug.c | 2 +- fs/ubifs/lprops.c | 2 +- fs/ubifs/lpt_commit.c | 4 ++-- fs/ubifs/orphan.c | 2 +- fs/xfs/kmem.c | 2 +- include/linux/vmalloc.h | 2 +- kernel/bpf/core.c | 6 +++--- kernel/groups.c | 2 +- kernel/module.c | 3 +-- mm/nommu.c | 15 +++++++-------- mm/page_alloc.c | 2 +- mm/percpu.c | 2 +- mm/vmalloc.c | 4 ++-- net/bridge/netfilter/ebtables.c | 6 ++---- sound/core/memalloc.c | 2 +- sound/core/pcm_memory.c | 2 +- 29 files changed, 47 insertions(+), 59 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 697ddd2afef9..e2137070386a 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -97,8 +97,7 @@ static int hv_cpu_init(unsigned int cpu) * not be stopped in the case of CPU offlining and the VM will hang. 
*/ if (!*hvp) { - *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL); + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); } if (*hvp) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0a6b35353fc7..e94b3de564d6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1279,8 +1279,7 @@ extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { - return __vmalloc(kvm_x86_ops.vm_size, - GFP_KERNEL_ACCOUNT | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } void kvm_arch_free_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 89f7f3aebd31..5573a97f1520 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -336,8 +336,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. 
*/ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 15e99697234a..df53dca5d02c 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -396,9 +396,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) bytes = sizeof(struct page *)*want; new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); if (!new_pages) { - new_pages = __vmalloc(bytes, - GFP_NOIO | __GFP_ZERO, - PAGE_KERNEL); + new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO); if (!new_pages) return NULL; } diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c index 648cf0207309..706af0304ca4 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c @@ -154,8 +154,8 @@ void etnaviv_core_dump(struct etnaviv_gem_submit *submit) file_size += sizeof(*iter.hdr) * n_obj; /* Allocate the file in vmalloc memory, it's likely to be big */ - iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, - PAGE_KERNEL); + iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | + __GFP_NORETRY); if (!iter.start) { mutex_unlock(&gpu->mmu_context->lock); dev_warn(gpu->dev, "failed to allocate devcoredump file\n"); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 9a967a2e83dd..6e677ff62cc9 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -145,9 +145,8 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init) int ret = 0; map_size = pblk_trans_map_size(pblk); - pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN - | __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM, - PAGE_KERNEL); + pblk->trans_map = __vmalloc(map_size, 
GFP_KERNEL | __GFP_NOWARN | + __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM); if (!pblk->trans_map) { pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n", map_size); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 2d519c223562..d1786cfd7f22 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -400,13 +400,13 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, */ if (gfp_mask & __GFP_NORETRY) { unsigned noio_flag = memalloc_noio_save(); - void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + void *ptr = __vmalloc(c->block_size, gfp_mask); memalloc_noio_restore(noio_flag); return ptr; } - return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + return __vmalloc(c->block_size, gfp_mask); } /* diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index b57b84fb97d0..14d890b00d2c 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -1297,7 +1297,7 @@ static int self_check_write(struct ubi_device *ubi, const void *buf, int pnum, if (!ubi_dbg_chk_io(ubi)) return 0; - buf1 = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); + buf1 = __vmalloc(len, GFP_NOFS); if (!buf1) { ubi_err(ubi, "cannot allocate memory to check writes"); return 0; @@ -1361,7 +1361,7 @@ int ubi_self_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len) if (!ubi_dbg_chk_io(ubi)) return 0; - buf = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(len, GFP_NOFS); if (!buf) { ubi_err(ubi, "cannot allocate memory to check for 0xFFs"); return 0; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index f45c22b09726..8be27426aa66 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -136,8 +136,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, while (bufsize >= SECTOR_SIZE) { buf = __vmalloc(bufsize, - GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY, - PAGE_KERNEL); + GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY); if (buf) { *buflen = bufsize; return buf; diff --git a/fs/gfs2/dir.c 
b/fs/gfs2/dir.c index c3f7732415be..c0f2875c946c 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,7 +354,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); if (hc == NULL) - hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + hc = __vmalloc(hsize, GFP_NOFS); if (hc == NULL) return ERR_PTR(-ENOMEM); @@ -1166,7 +1166,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) hc2 = kmalloc_array(hsize_bytes, 2, GFP_NOFS | __GFP_NOWARN); if (hc2 == NULL) - hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS); if (!hc2) return -ENOMEM; @@ -1327,7 +1327,7 @@ static void *gfs2_alloc_sort_buffer(unsigned size) if (size < KMALLOC_MAX_SIZE) ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); if (!ptr) - ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + ptr = __vmalloc(size, GFP_NOFS); return ptr; } @@ -1987,8 +1987,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); if (ht == NULL) - ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO, - PAGE_KERNEL); + ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO); if (!ht) return -ENOMEM; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 8259fef3f986..4b67d47a7e00 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1365,7 +1365,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); if (sdp->sd_quota_bitmap == NULL) sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | - __GFP_ZERO, PAGE_KERNEL); + __GFP_ZERO); if (!sdp->sd_quota_bitmap) return error; diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 7a57ff2528af..8f7cff7a4293 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -582,7 +582,7 @@ retry: if (!arg->layoutupdate_pages) return -ENOMEM; - start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + start_p = 
__vmalloc(buffer_size, GFP_NOFS); if (!start_p) { kfree(arg->layoutupdate_pages); return -ENOMEM; diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index 842b0bfc3ac9..7068425735f1 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -34,7 +34,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) /* return (void *)__get_free_page(gfp_mask); */ } if (likely((size >> PAGE_SHIFT) < totalram_pages())) - return __vmalloc(size, gfp_mask, PAGE_KERNEL); + return __vmalloc(size, gfp_mask); return NULL; } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0f5a480fe264..31288d8fa2ce 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -815,7 +815,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum); return; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 29826c51883a..22bfda158f7f 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1095,7 +1095,7 @@ static int scan_check_cb(struct ubifs_info *c, return LPT_SCAN_CONTINUE; } - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) return -ENOMEM; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index ff5e0411cf2d..d76a19e460cd 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1596,7 +1596,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) if (!dbg_is_chk_lprops(c)) return 0; - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for ltab checking"); return 0; @@ -1845,7 +1845,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) void *buf, *p; pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = p = 
__vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to dump LPT"); return; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 283f9eb48410..2c294085ffed 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -977,7 +977,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) if (c->no_orphs) return 0; - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to check orphans"); return 0; diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 1da94237a8cf..f1366475c389 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ __kmem_vmalloc(size_t size, xfs_km_flags_t flags) if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = __vmalloc(size, lflags); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 5488cea5ef11..1c278e030599 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -110,7 +110,7 @@ extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); -extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); +extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 14aa1f74dd10..cf6fe9107f5c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); - 
fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (ret) return NULL; - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { @@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, diff --git a/kernel/groups.c b/kernel/groups.c index daae2f2dc6d4..6ee6691f6839 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); if (!gi) return NULL; diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..086618a0058f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2946,8 +2946,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. 
*/ - info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); + info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); if (!info->hdr) return -ENOMEM; diff --git a/mm/nommu.c b/mm/nommu.c index 4f07b7ef0297..2df549adb22b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -140,7 +140,7 @@ void vfree(const void *addr) } EXPORT_SYMBOL(vfree); -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { /* * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() @@ -152,14 +152,14 @@ EXPORT_SYMBOL(__vmalloc); void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) { - return __vmalloc(size, flags, PAGE_KERNEL); + return __vmalloc(size, flags); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, flags, PAGE_KERNEL); + ret = __vmalloc(size, flags); if (ret) { struct vm_area_struct *vma; @@ -230,7 +230,7 @@ long vwrite(char *buf, char *addr, unsigned long count) */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); @@ -248,8 +248,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -302,7 +301,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } /** @@ -314,7 +313,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_32); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cc406ee17ad9..45ad73122e82 100644 --- 
a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8244,7 +8244,7 @@ void *__init alloc_large_system_hash(const char *tablename, table = memblock_alloc_raw(size, SMP_CACHE_BYTES); } else if (get_order(size) >= MAX_ORDER || hashdist) { - table = __vmalloc(size, gfp_flags, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags); virt = true; } else { /* diff --git a/mm/percpu.c b/mm/percpu.c index 7da7d7737dab..696367b18222 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -482,7 +482,7 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) if (size <= PAGE_SIZE) return kzalloc(size, gfp); else - return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(size, gfp | __GFP_ZERO); } /** diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4d7c7108181a..11194ae18f23 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2564,9 +2564,9 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_mask, prot, 0, node, caller); } -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, + return __vmalloc_node(size, 1, gfp_mask, PAGE_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 78db58c7aec2..7e869284e052 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1095,16 +1095,14 @@ static int do_replace(struct net *net, const void __user *user, tmp.name[sizeof(tmp.name) - 1] = 0; countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids; - newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT, - PAGE_KERNEL); + newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT); if (!newinfo) return -ENOMEM; if (countersize) memset(newinfo->counters, 0, countersize); - newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT, - PAGE_KERNEL); + newinfo->entries = 
__vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT); if (!newinfo->entries) { ret = -ENOMEM; goto free_newinfo; diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index a83553fbedf0..bea46ed157a6 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -143,7 +143,7 @@ int snd_dma_alloc_pages(int type, struct device *device, size_t size, break; case SNDRV_DMA_TYPE_VMALLOC: gfp = snd_mem_get_gfp_flags(device, GFP_KERNEL | __GFP_HIGHMEM); - dmab->area = __vmalloc(size, gfp, PAGE_KERNEL); + dmab->area = __vmalloc(size, gfp); dmab->addr = 0; break; #ifdef CONFIG_HAS_DMA diff --git a/sound/core/pcm_memory.c b/sound/core/pcm_memory.c index fcab37ea6641..860935e3aea4 100644 --- a/sound/core/pcm_memory.c +++ b/sound/core/pcm_memory.c @@ -460,7 +460,7 @@ int _snd_pcm_lib_alloc_vmalloc_buffer(struct snd_pcm_substream *substream, return 0; /* already large enough */ vfree(runtime->dma_area); } - runtime->dma_area = __vmalloc(size, gfp_flags, PAGE_KERNEL); + runtime->dma_area = __vmalloc(size, gfp_flags); if (!runtime->dma_area) return -ENOMEM; runtime->dma_bytes = size; -- cgit v1.2.3 From f38fcb9c1c5e9df20bf6d8aa9e9f36eb98d78c9f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:45 -0700 Subject: mm: remove the prot argument to __vmalloc_node This is always PAGE_KERNEL now. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-23-hch@lst.de Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 11194ae18f23..c9343f1a7268 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2402,8 +2402,7 @@ void *vmap(struct page **pages, unsigned int count, EXPORT_SYMBOL(vmap); static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller); + gfp_t gfp_mask, int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -2421,7 +2420,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, - PAGE_KERNEL, node, area->caller); + node, area->caller); } else { pages = kmalloc_node(array_size, nested_gfp, node); } @@ -2540,13 +2539,11 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range); * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator - * @prot: protection mask for the allocated pages * @node: node to use for allocation or NUMA_NO_NODE * @caller: caller's return address * - * Allocate enough pages to cover @size from the page level - * allocator with @gfp_mask flags. Map them into contiguous - * kernel virtual space, using a pagetable protection of @prot. + * Allocate enough pages to cover @size from the page level allocator with + * @gfp_mask flags. 
Map them into contiguous kernel virtual space. * * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported @@ -2557,16 +2554,15 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range); * Return: pointer to the allocated memory or %NULL on error */ static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, const void *caller) + gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, - gfp_mask, prot, 0, node, caller); + gfp_mask, PAGE_KERNEL, 0, node, caller); } void *__vmalloc(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, PAGE_KERNEL, NUMA_NO_NODE, + return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); @@ -2574,15 +2570,15 @@ EXPORT_SYMBOL(__vmalloc); static inline void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) { - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, - node, __builtin_return_address(0)); + return __vmalloc_node(size, 1, flags, node, + __builtin_return_address(0)); } void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, void *caller) { - return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); + return __vmalloc_node(size, 1, flags, node, caller); } /** @@ -2657,8 +2653,8 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, - node, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_KERNEL, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -2671,9 +2667,6 @@ EXPORT_SYMBOL(vmalloc_node); * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * - * For tight control over page level allocator and protection flags - * use __vmalloc_node() instead. 
- * * Return: pointer to the allocated memory or %NULL on error */ void *vzalloc_node(unsigned long size, int node) @@ -2746,8 +2739,8 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); -- cgit v1.2.3 From 4d39d7285f45cc6c72b850f040d3addd626658e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:49 -0700 Subject: mm: remove both instances of __vmalloc_node_flags The real version just had a few callers that can open code it and remove one layer of indirection. The nommu stub was public but only had a single caller, so remove it and avoid a CONFIG_MMU ifdef in vmalloc.h. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-24-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 9 --------- mm/nommu.c | 3 ++- mm/vmalloc.c | 20 ++++++-------------- 3 files changed, 8 insertions(+), 24 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 1c278e030599..91d143783458 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -115,17 +115,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); -#ifndef CONFIG_MMU -extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); -static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, - gfp_t flags, void *caller) -{ - return __vmalloc_node_flags(size, node, flags); -} -#else extern void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, void *caller); -#endif extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/mm/nommu.c b/mm/nommu.c index 2df549adb22b..9553efa59787 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -150,7 +150,8 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) +void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, + void *caller) { return __vmalloc(size, flags); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c9343f1a7268..e9f730092c81 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2567,14 +2567,6 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask) } 
EXPORT_SYMBOL(__vmalloc); -static inline void *__vmalloc_node_flags(unsigned long size, - int node, gfp_t flags) -{ - return __vmalloc_node(size, 1, flags, node, - __builtin_return_address(0)); -} - - void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, void *caller) { @@ -2595,8 +2587,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, */ void *vmalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL); + return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -2615,8 +2607,8 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc); @@ -2671,8 +2663,8 @@ EXPORT_SYMBOL(vmalloc_node); */ void *vzalloc_node(unsigned long size, int node) { - return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_ZERO); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, + __builtin_return_address(0)); } EXPORT_SYMBOL(vzalloc_node); -- cgit v1.2.3 From 2b9059489c839e67ca9254913325e18cea11a980 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:53 -0700 Subject: mm: remove __vmalloc_node_flags_caller Just use __vmalloc_node instead, which gets an extra argument. To be able to use __vmalloc_node in all callers, make it available outside of vmalloc and implement it in nommu.c. [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20200414131348.444715-25-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 4 ++-- kernel/bpf/syscall.c | 5 ++--- mm/nommu.c | 6 +++--- mm/util.c | 2 +- mm/vmalloc.c | 10 +--------- 5 files changed, 9 insertions(+), 18 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 91d143783458..964d58a060ca 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -115,8 +115,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); -extern void *__vmalloc_node_flags_caller(unsigned long size, - int node, gfp_t flags, void *caller); +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller); extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4e6dee19a668..9c1cf7a87fb3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -299,9 +299,8 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags); } - return __vmalloc_node_flags_caller(size, numa_node, - GFP_KERNEL | __GFP_RETRY_MAYFAIL | - flags, __builtin_return_address(0)); + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, + numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) diff --git a/mm/nommu.c b/mm/nommu.c index 9553efa59787..08848fc18672 100644 
--- a/mm/nommu.c +++ b/mm/nommu.c @@ -150,10 +150,10 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, - void *caller) +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller) { - return __vmalloc(size, flags); + return __vmalloc(size, gfp_mask); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) diff --git a/mm/util.c b/mm/util.c index 988d11e6c17c..6d5868adbe18 100644 --- a/mm/util.c +++ b/mm/util.c @@ -580,7 +580,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (ret || size <= PAGE_SIZE) return ret; - return __vmalloc_node_flags_caller(size, node, flags, + return __vmalloc_node(size, 1, flags, node, __builtin_return_address(0)); } EXPORT_SYMBOL(kvmalloc_node); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e9f730092c81..88f9971a7d6e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2401,8 +2401,6 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node) { @@ -2553,7 +2551,7 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range); * * Return: pointer to the allocated memory or %NULL on error */ -static void *__vmalloc_node(unsigned long size, unsigned long align, +void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, @@ -2567,12 +2565,6 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc); -void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, - void *caller) -{ - return __vmalloc_node(size, 1, flags, node, caller); -} - /** * vmalloc - allocate virtually contiguous memory * 
@size: allocation size -- cgit v1.2.3 From c3f896dcf1e47959aca4f8e6ac9537b478949126 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:57 -0700 Subject: mm: switch the test_vmalloc module to use __vmalloc_node No need to export the very low-level __vmalloc_node_range when the test module can use a slightly higher level variant. [akpm@linux-foundation.org: add missing `node' arg] [akpm@linux-foundation.org: fix riscv nommu build] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-26-hch@lst.de Signed-off-by: Linus Torvalds --- arch/riscv/include/asm/pgtable.h | 4 ++-- lib/test_vmalloc.c | 26 +++++++------------------- mm/vmalloc.c | 17 ++++++++--------- 3 files changed, 17 insertions(+), 30 deletions(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 35b60035b6b0..d50706ea1c94 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -473,9 +473,9 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, #define PAGE_SHARED __pgprot(0) #define PAGE_KERNEL __pgprot(0) #define swapper_pg_dir NULL +#define TASK_SIZE 0xffffffffUL #define VMALLOC_START 0 - -#define TASK_SIZE 0xffffffffUL +#define VMALLOC_END TASK_SIZE static inline void __kernel_map_pages(struct page *page, int numpages, int enable) {} diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index 8bbefcaddfe8..ddc9685702b1 100644 --- 
a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -91,12 +91,8 @@ static int random_size_align_alloc_test(void) */ size = ((rnd % 10) + 1) * PAGE_SIZE; - ptr = __vmalloc_node_range(size, align, - VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, - 0, 0, __builtin_return_address(0)); - + ptr = __vmalloc_node(size, align, GFP_KERNEL | __GFP_ZERO, 0, + __builtin_return_address(0)); if (!ptr) return -1; @@ -118,12 +114,8 @@ static int align_shift_alloc_test(void) for (i = 0; i < BITS_PER_LONG; i++) { align = ((unsigned long) 1) << i; - ptr = __vmalloc_node_range(PAGE_SIZE, align, - VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, - 0, 0, __builtin_return_address(0)); - + ptr = __vmalloc_node(PAGE_SIZE, align, GFP_KERNEL|__GFP_ZERO, 0, + __builtin_return_address(0)); if (!ptr) return -1; @@ -139,13 +131,9 @@ static int fix_align_alloc_test(void) int i; for (i = 0; i < test_loop_count; i++) { - ptr = __vmalloc_node_range(5 * PAGE_SIZE, - THREAD_ALIGN << 1, - VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL, - 0, 0, __builtin_return_address(0)); - + ptr = __vmalloc_node(5 * PAGE_SIZE, THREAD_ALIGN << 1, + GFP_KERNEL | __GFP_ZERO, 0, + __builtin_return_address(0)); if (!ptr) return -1; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 88f9971a7d6e..931106654d1f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2523,15 +2523,6 @@ fail: return NULL; } -/* - * This is only for performance analysis of vmalloc and stress purpose. - * It is required by vmalloc test module, therefore do not use it other - * than that. 
- */ -#ifdef CONFIG_TEST_VMALLOC_MODULE -EXPORT_SYMBOL_GPL(__vmalloc_node_range); -#endif - /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size @@ -2557,6 +2548,14 @@ void *__vmalloc_node(unsigned long size, unsigned long align, return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } +/* + * This is only for performance analysis of vmalloc and stress purpose. + * It is required by vmalloc test module, therefore do not use it other + * than that. + */ +#ifdef CONFIG_TEST_VMALLOC_MODULE +EXPORT_SYMBOL_GPL(__vmalloc_node); +#endif void *__vmalloc(unsigned long size, gfp_t gfp_mask) { -- cgit v1.2.3 From 041de93ff86fc500aa73e5360039c95f4d31b95f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:52:02 -0700 Subject: mm: remove vmalloc_user_node_flags Open code it in __bpf_map_area_alloc, which is the only caller. Also clean up __bpf_map_area_alloc to have a single vmalloc call with slightly different flags instead of the current two different calls. For this to compile for the nommu case add a __vmalloc_node_range stub to nommu.c. [akpm@linux-foundation.org: fix nommu.c build] Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/20200414131348.444715-27-hch@lst.de Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 - kernel/bpf/syscall.c | 24 ++++++++++++++---------- mm/nommu.c | 14 ++++++++------ mm/vmalloc.c | 20 -------------------- 4 files changed, 22 insertions(+), 37 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 964d58a060ca..3332926295d4 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -106,7 +106,6 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); -extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9c1cf7a87fb3..42c7a42fc9c8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ @@ -281,26 +282,29 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. 
*/ - const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + unsigned int flags = 0; + unsigned long align = 1; void *area; if (size >= SIZE_MAX) return NULL; /* kmalloc()'ed memory can't be mmap()'ed */ - if (!mmapable && size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + if (mmapable) { + BUG_ON(!PAGE_ALIGNED(size)); + align = SHMLBA; + flags = VM_USERMAP; + } else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + area = kmalloc_node(size, gfp | GFP_USER | __GFP_NORETRY, numa_node); if (area != NULL) return area; } - if (mmapable) { - BUG_ON(!PAGE_ALIGNED(size)); - return vmalloc_user_node_flags(size, numa_node, GFP_KERNEL | - __GFP_RETRY_MAYFAIL | flags); - } - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_RETRY_MAYFAIL | flags, - numa_node, __builtin_return_address(0)); + + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp | GFP_KERNEL | __GFP_RETRY_MAYFAIL, PAGE_KERNEL, + flags, numa_node, __builtin_return_address(0)); } void *bpf_map_area_alloc(u64 size, int numa_node) diff --git a/mm/nommu.c b/mm/nommu.c index 08848fc18672..371697bf372d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -150,6 +150,14 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc); +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) +{ + return __vmalloc(size, gfp_mask); +} + void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { @@ -180,12 +188,6 @@ void *vmalloc_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_user); -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_user_flags(size, flags | __GFP_ZERO); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - struct page *vmalloc_to_page(const 
void *addr) { return virt_to_page(addr); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 931106654d1f..410bfe26ee73 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2659,26 +2659,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -/** - * vmalloc_user_node_flags - allocate memory for userspace on a specific node - * @size: allocation size - * @node: numa node - * @flags: flags for the page level allocator - * - * The resulting memory area is zeroed so it can be mapped to userspace - * without leaking data. - * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) -{ - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, - flags | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, node, - __builtin_return_address(0)); -} -EXPORT_SYMBOL(vmalloc_user_node_flags); - /** * vmalloc_exec - allocate virtually contiguous, executable memory * @size: allocation size -- cgit v1.2.3 From ebcdd3074a729f9ba351278e1b42d7ae7fcdf236 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:52:06 -0700 Subject: arm64: use __vmalloc_node in arch_alloc_vmap_stack arch_alloc_vmap_stack can use a slightly higher level vmalloc function. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-28-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/vmap_stack.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 0a12115d9638..0cc6636e3f15 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -19,10 +19,8 @@ static inline unsigned long *arch_alloc_vmap_stack(size_t stack_size, int node) { BUILD_BUG_ON(!IS_ENABLED(CONFIG_VMAP_STACK)); - return __vmalloc_node_range(stack_size, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, PAGE_KERNEL, 0, node, - __builtin_return_address(0)); + return __vmalloc_node(stack_size, THREAD_ALIGN, THREADINFO_GFP, node, + __builtin_return_address(0)); } #endif /* __ASM_VMAP_STACK_H */ -- cgit v1.2.3 From cb0849a990b2928760bc38561e8d33b554196e25 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:52:10 -0700 Subject: powerpc: use __vmalloc_node in alloc_vm_stack alloc_vm_stack can use a slightly higher level vmalloc function. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-29-hch@lst.de Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/irq.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 1f1169856dc8..112d150354b2 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -748,9 +748,8 @@ void do_IRQ(struct pt_regs *regs) static void *__init alloc_vm_stack(void) { - return __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, VMALLOC_START, - VMALLOC_END, THREADINFO_GFP, PAGE_KERNEL, - 0, NUMA_NO_NODE, (void*)_RET_IP_); + return __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP, + NUMA_NO_NODE, (void *)_RET_IP_); } static void __init vmap_irqstack_init(void) -- cgit v1.2.3 From b200f5b58c86aa75b8b6952ce9b89013c6660a57 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:52:14 -0700 Subject: s390: use __vmalloc_node in stack_alloc stack_alloc can use a slightly higher level vmalloc function. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Christian Borntraeger Acked-by: Peter Zijlstra (Intel) Cc: "K. Y. 
Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Wei Liu Cc: David Airlie Cc: Laura Abbott Cc: Sumit Semwal Cc: Sakari Ailus Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Christophe Leroy Cc: Daniel Vetter Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Johannes Weiner Cc: Mark Rutland Cc: Michael Kelley Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-30-hch@lst.de Signed-off-by: Linus Torvalds --- arch/s390/kernel/setup.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 36445dd40fdb..0f0b140b5558 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -305,12 +305,9 @@ void *restart_stack __section(.data); unsigned long stack_alloc(void) { #ifdef CONFIG_VMAP_STACK - return (unsigned long) - __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, - PAGE_KERNEL, 0, NUMA_NO_NODE, - __builtin_return_address(0)); + return (unsigned long)__vmalloc_node(THREAD_SIZE, THREAD_SIZE, + THREADINFO_GFP, NUMA_NO_NODE, + __builtin_return_address(0)); #else return __get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); #endif -- cgit v1.2.3 From d8626138009ba58ae2c22356966c2edaa1f1c3b5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:18 -0700 Subject: mm: add functions to track page directory modifications Patch series "mm: Get rid of vmalloc_sync_(un)mappings()", v3. After the recent issue with vmalloc and tracing code[1] on x86 and a long history of previous issues related to the vmalloc_sync_mappings() interface, I thought the time has come to remove it. Please see [2], [3], and [4] for some other issues in the past. The patches add tracking of page-table directory changes to the vmalloc and ioremap code. 
Depending on the page-table levels at which changes have been made, a new per-arch function is called: arch_sync_kernel_mappings(). On x86-64 with 4-level paging, this function will not be called more than 64 times in a system's runtime (because vmalloc-space takes 64 PGD entries which are only populated, but never cleared). As a side effect this also allows getting rid of vmalloc faults on x86, making it safe to touch vmalloc'ed memory in the page-fault handler. Note that this potentially includes per-cpu memory. This patch (of 7): Add page-table allocation functions which will keep track of changed directory entries. They are needed for new PGD, P4D, PUD, and PMD entries and will be used in vmalloc and ioremap code to decide whether any changes in the kernel mappings need to be synchronized between page-tables in the system. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: "H . Peter Anvin" Cc: Dave Hansen Cc: "Rafael J. Wysocki" Cc: Arnd Bergmann Cc: Steven Rostedt (VMware) Cc: Vlastimil Babka Cc: Michal Hocko Cc: Matthew Wilcox (Oracle) Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20200515140023.25469-1-joro@8bytes.org Link: http://lkml.kernel.org/r/20200515140023.25469-2-joro@8bytes.org Signed-off-by: Linus Torvalds --- include/asm-generic/5level-fixup.h | 5 +++-- include/asm-generic/pgtable.h | 23 +++++++++++++++++++ include/linux/mm.h | 46 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index 4c74b1c1d13b..58046ddc08d0 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -17,8 +17,9 @@ ((unlikely(pgd_none(*(p4d))) && __pud_alloc(mm, p4d, address)) ? 
\ NULL : pud_offset(p4d, address)) -#define p4d_alloc(mm, pgd, address) (pgd) -#define p4d_offset(pgd, start) (pgd) +#define p4d_alloc(mm, pgd, address) (pgd) +#define p4d_alloc_track(mm, pgd, address, mask) (pgd) +#define p4d_offset(pgd, start) (pgd) #ifndef __ASSEMBLY__ static inline int p4d_none(p4d_t p4d) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 8c5f9c29698b..db7df7daa0d8 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1213,6 +1213,29 @@ static inline bool arch_has_pfn_modify_check(void) # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif +/* + * Page Table Modification bits for pgtbl_mod_mask. + * + * These are used by the p?d_alloc_track*() set of functions an in the generic + * vmalloc/ioremap code to track at which page-table levels entries have been + * modified. Based on that the code can better decide when vmalloc and ioremap + * mapping changes need to be synchronized to other page-tables in the system. + */ +#define __PGTBL_PGD_MODIFIED 0 +#define __PGTBL_P4D_MODIFIED 1 +#define __PGTBL_PUD_MODIFIED 2 +#define __PGTBL_PMD_MODIFIED 3 +#define __PGTBL_PTE_MODIFIED 4 + +#define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) +#define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) +#define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) +#define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) +#define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) + +/* Page-Table Modification Mask */ +typedef unsigned int pgtbl_mod_mask; + #endif /* !__ASSEMBLY__ */ #ifndef io_remap_pfn_range diff --git a/include/linux/mm.h b/include/linux/mm.h index ebbb0acbeee2..fda41eb7f1c8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2091,13 +2091,54 @@ static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? 
NULL : pud_offset(p4d, address); } + +static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd, + unsigned long address, + pgtbl_mod_mask *mod_mask) + +{ + if (unlikely(pgd_none(*pgd))) { + if (__p4d_alloc(mm, pgd, address)) + return NULL; + *mod_mask |= PGTBL_PGD_MODIFIED; + } + + return p4d_offset(pgd, address); +} + #endif /* !__ARCH_HAS_5LEVEL_HACK */ +static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d, + unsigned long address, + pgtbl_mod_mask *mod_mask) +{ + if (unlikely(p4d_none(*p4d))) { + if (__pud_alloc(mm, p4d, address)) + return NULL; + *mod_mask |= PGTBL_P4D_MODIFIED; + } + + return pud_offset(p4d, address); +} + static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } + +static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud, + unsigned long address, + pgtbl_mod_mask *mod_mask) +{ + if (unlikely(pud_none(*pud))) { + if (__pmd_alloc(mm, pud, address)) + return NULL; + *mod_mask |= PGTBL_PUD_MODIFIED; + } + + return pmd_offset(pud, address); +} #endif /* CONFIG_MMU */ #if USE_SPLIT_PTE_PTLOCKS @@ -2213,6 +2254,11 @@ static inline void pgtable_pte_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \ NULL: pte_offset_kernel(pmd, address)) +#define pte_alloc_kernel_track(pmd, address, mask) \ + ((unlikely(pmd_none(*(pmd))) && \ + (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\ + NULL: pte_offset_kernel(pmd, address)) + #if USE_SPLIT_PMD_PTLOCKS static struct page *pmd_to_page(pmd_t *pmd) -- cgit v1.2.3 From 2ba3e6947aed9bb9575eb1603c0ac6e39185d32a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:22 -0700 Subject: mm/vmalloc: track which page-table levels were modified Track at which levels in the page-table entries were modified by vmap/vunmap. 
After the page-table has been modified, use that information to decide whether the new arch_sync_kernel_mappings() needs to be called. [akpm@linux-foundation.org: map_kernel_range_noflush() needs the arch_sync_kernel_mappings() call] Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-3-joro@8bytes.org Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 16 +++++++++ mm/vmalloc.c | 95 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 85 insertions(+), 26 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3332926295d4..0efc35dc5b25 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -133,6 +133,22 @@ extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, void vmalloc_sync_mappings(void); void vmalloc_sync_unmappings(void); +/* + * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values + * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() + * needs to be called. + */ +#ifndef ARCH_PAGE_TABLE_SYNC_MASK +#define ARCH_PAGE_TABLE_SYNC_MASK 0 +#endif + +/* + * There is no default implementation for arch_sync_kernel_mappings(). It is + * relied upon the compiler to optimize calls out if ARCH_PAGE_TABLE_SYNC_MASK + * is 0. + */ +void arch_sync_kernel_mappings(unsigned long start, unsigned long end); + /* * Lowlevel-APIs (not for driver use!) 
*/ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 410bfe26ee73..154e3396154c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -69,7 +69,8 @@ static void free_work(struct work_struct *w) /*** Page table manipulation functions ***/ -static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) +static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -78,59 +79,81 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); WARN_ON(!pte_none(ptent) && !pte_present(ptent)); } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; } -static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; + int cleared; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_clear_huge(pmd)) + + cleared = pmd_clear_huge(pmd); + if (cleared || pmd_bad(*pmd)) + *mask |= PGTBL_PMD_MODIFIED; + + if (cleared) continue; if (pmd_none_or_clear_bad(pmd)) continue; - vunmap_pte_range(pmd, addr, next); + vunmap_pte_range(pmd, addr, next, mask); } while (pmd++, addr = next, addr != end); } -static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end) +static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; + int cleared; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (pud_clear_huge(pud)) + + cleared = pud_clear_huge(pud); + if (cleared || pud_bad(*pud)) + *mask |= PGTBL_PUD_MODIFIED; + + if (cleared) continue; if (pud_none_or_clear_bad(pud)) continue; - vunmap_pmd_range(pud, addr, next); + vunmap_pmd_range(pud, addr, next, mask); } while (pud++, addr = next, addr != end); } -static void vunmap_p4d_range(pgd_t *pgd, 
unsigned long addr, unsigned long end) +static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; + int cleared; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); - if (p4d_clear_huge(p4d)) + + cleared = p4d_clear_huge(p4d); + if (cleared || p4d_bad(*p4d)) + *mask |= PGTBL_P4D_MODIFIED; + + if (cleared) continue; if (p4d_none_or_clear_bad(p4d)) continue; - vunmap_pud_range(p4d, addr, next); + vunmap_pud_range(p4d, addr, next, mask); } while (p4d++, addr = next, addr != end); } /** * unmap_kernel_range_noflush - unmap kernel VM area - * @addr: start of the VM area to unmap + * @start: start of the VM area to unmap * @size: size of the VM area to unmap * * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify @@ -141,24 +164,33 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end) * for calling flush_cache_vunmap() on to-be-mapped areas before calling this * function and flush_tlb_kernel_range() after. 
*/ -void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) +void unmap_kernel_range_noflush(unsigned long start, unsigned long size) { - unsigned long end = addr + size; + unsigned long end = start + size; unsigned long next; pgd_t *pgd; + unsigned long addr = start; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); + start = addr; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; if (pgd_none_or_clear_bad(pgd)) continue; - vunmap_p4d_range(pgd, addr, next); + vunmap_p4d_range(pgd, addr, next, &mask); } while (pgd++, addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pte_t *pte; @@ -167,7 +199,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, * callers keep track of where we're up to. 
*/ - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { @@ -180,55 +212,59 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; return 0; } static int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; - pmd = pmd_alloc(&init_mm, pud, addr); + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) + if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } static int vmap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, p4d, addr); + pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) + if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page **pages, int *nr) + unsigned long end, pgprot_t prot, struct page **pages, int *nr, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; - p4d = p4d_alloc(&init_mm, pgd, addr); + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (vmap_pud_range(p4d, addr, next, prot, pages, nr)) + 
if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask)) return -ENOMEM; } while (p4d++, addr = next, addr != end); return 0; @@ -255,21 +291,28 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, int map_kernel_range_noflush(unsigned long addr, unsigned long size, pgprot_t prot, struct page **pages) { + unsigned long start = addr; unsigned long end = addr + size; unsigned long next; pgd_t *pgd; int err = 0; int nr = 0; + pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr); + if (pgd_bad(*pgd)) + mask |= PGTBL_PGD_MODIFIED; + err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + return 0; } -- cgit v1.2.3 From 6c0c7d2b365b21a413f6d75772a8a4a2c7d36916 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:26 -0700 Subject: mm/ioremap: track which page-table levels were modified Track at which levels in the page-table entries were modified by ioremap_page_range(). After the page-table has been modified, use that information to decide whether the new arch_sync_kernel_mappings() needs to be called. The iounmap path re-uses vunmap(), which has already been taken care of. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-4-joro@8bytes.org Signed-off-by: Linus Torvalds --- lib/ioremap.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/lib/ioremap.c b/lib/ioremap.c index 3f0e18543de8..ad485f08173b 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -61,13 +61,14 @@ static inline int ioremap_pmd_enabled(void) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pte_t *pte; u64 pfn; pfn = phys_addr >> PAGE_SHIFT; - pte = pte_alloc_kernel(pmd, addr); + pte = pte_alloc_kernel_track(pmd, addr, mask); if (!pte) return -ENOMEM; do { @@ -75,6 +76,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; return 0; } @@ -101,21 +103,24 @@ static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr, } static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; - pmd = pmd_alloc(&init_mm, pud, addr); + pmd = pmd_alloc_track(&init_mm, pud, addr, mask); if (!pmd) return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) + if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) { + *mask |= PGTBL_PMD_MODIFIED; continue; + } - if (ioremap_pte_range(pmd, addr, next, phys_addr, prot)) + if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (pmd++, phys_addr += (next - addr), addr = next, 
addr != end); return 0; @@ -144,21 +149,24 @@ static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr, } static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; - pud = pud_alloc(&init_mm, p4d, addr); + pud = pud_alloc_track(&init_mm, p4d, addr, mask); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); - if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) + if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) { + *mask |= PGTBL_PUD_MODIFIED; continue; + } - if (ioremap_pmd_range(pud, addr, next, phys_addr, prot)) + if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (pud++, phys_addr += (next - addr), addr = next, addr != end); return 0; @@ -187,21 +195,24 @@ static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr, } static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, phys_addr_t phys_addr, pgprot_t prot) + unsigned long end, phys_addr_t phys_addr, pgprot_t prot, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; - p4d = p4d_alloc(&init_mm, pgd, addr); + p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); - if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) + if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) { + *mask |= PGTBL_P4D_MODIFIED; continue; + } - if (ioremap_pud_range(p4d, addr, next, phys_addr, prot)) + if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask)) return -ENOMEM; } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); return 0; @@ -214,6 +225,7 @@ int ioremap_page_range(unsigned long addr, unsigned long start; unsigned long next; int err; + pgtbl_mod_mask mask = 0; might_sleep(); BUG_ON(addr >= end); @@ -222,13 +234,17 @@ int 
ioremap_page_range(unsigned long addr, pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot); + err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot, + &mask); if (err) break; } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); flush_cache_vmap(start, end); + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + return err; } -- cgit v1.2.3 From 8e19843c36abae08e1e541a65ce53fd2e88499fc Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:29 -0700 Subject: x86/mm/64: implement arch_sync_kernel_mappings() Implement the function to sync changes in vmalloc and ioremap ranges to all page-tables. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-5-joro@8bytes.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64_types.h | 2 ++ arch/x86/mm/init_64.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 52e5f5f2240d..8f63efb2a2cc 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -159,4 +159,6 @@ extern unsigned int ptrs_per_p4d; #define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) +#define ARCH_PAGE_TABLE_SYNC_MASK (pgtable_l5_enabled() ? 
PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED) + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8b5f73f5e207..96274a90c5ff 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -218,6 +218,11 @@ void sync_global_pgds(unsigned long start, unsigned long end) sync_global_pgds_l4(start, end); } +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + sync_global_pgds(start, end); +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. -- cgit v1.2.3 From 86cf69f1d893d48fdb0382a940f2523409406423 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:33 -0700 Subject: x86/mm/32: implement arch_sync_kernel_mappings() Implement the function to sync changes in vmalloc and ioremap ranges to all page-tables. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-6-joro@8bytes.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-2level_types.h | 2 ++ arch/x86/include/asm/pgtable-3level_types.h | 2 ++ arch/x86/mm/fault.c | 25 ++++++++++++++++--------- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 6deb6cd236e3..7f6ccff0ba72 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -20,6 +20,8 @@ typedef union { #define SHARED_KERNEL_PMD 0 +#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED + /* * traditional i386 two-level paging structure: */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 33845d36897c..80fbb4a9ed87 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -27,6 +27,8 @@ typedef union { #define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) #endif +#define ARCH_PAGE_TABLE_SYNC_MASK (SHARED_KERNEL_PMD ? 
0 : PGTBL_PMD_MODIFIED) + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a51df516b87b..edeb2adaf31f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -190,16 +190,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -static void vmalloc_sync(void) +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; + unsigned long addr; - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE_MAX && address < VMALLOC_END; - address += PMD_SIZE) { + for (addr = start & PMD_MASK; + addr >= TASK_SIZE_MAX && addr < VMALLOC_END; + addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); @@ -210,13 +207,23 @@ static void vmalloc_sync(void) pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - vmalloc_sync_one(page_address(page), address); + vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } +static void vmalloc_sync(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + arch_sync_kernel_mappings(VMALLOC_START, VMALLOC_END); +} + void vmalloc_sync_mappings(void) { vmalloc_sync(); -- cgit v1.2.3 From 73f693c3a705756032c2863bfb37570276902d7d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:36 -0700 Subject: mm: remove vmalloc_sync_(un)mappings() These functions are not needed anymore because the vmalloc and ioremap mappings are now synchronized when they are created or torn down. Remove all callers and function definitions. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Tested-by: Steven Rostedt (VMware) Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. 
Wysocki" Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-7-joro@8bytes.org Signed-off-by: Linus Torvalds --- arch/x86/mm/fault.c | 37 ------------------------------------- drivers/acpi/apei/ghes.c | 6 ------ include/linux/vmalloc.h | 2 -- kernel/notifier.c | 1 - kernel/trace/trace.c | 12 ------------ mm/nommu.c | 12 ------------ mm/vmalloc.c | 21 --------------------- 7 files changed, 91 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index edeb2adaf31f..255fc631b042 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -214,26 +214,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } -static void vmalloc_sync(void) -{ - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; - - arch_sync_kernel_mappings(VMALLOC_START, VMALLOC_END); -} - -void vmalloc_sync_mappings(void) -{ - vmalloc_sync(); -} - -void vmalloc_sync_unmappings(void) -{ - vmalloc_sync(); -} - /* * 32-bit: * @@ -336,23 +316,6 @@ out: #else /* CONFIG_X86_64: */ -void vmalloc_sync_mappings(void) -{ - /* - * 64-bit mappings might allocate new p4d/pud pages - * that need to be propagated to all tasks' PGDs. - */ - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); -} - -void vmalloc_sync_unmappings(void) -{ - /* - * Unmappings never allocate or free p4d/pud pages. - * No work is required here. - */ -} - /* * 64-bit: * diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 24c9642e8fc7..aabe9c5ee515 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -167,12 +167,6 @@ int ghes_estatus_pool_init(int num_ghes) if (!addr) goto err_pool_alloc; - /* - * New allocation must be visible in all pgd before it can be found by - * an NMI allocating from the pool. 
- */ - vmalloc_sync_mappings(); - rc = gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1); if (rc) goto err_pool_add; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0efc35dc5b25..48bb681e6c2a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -130,8 +130,6 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); -void vmalloc_sync_mappings(void); -void vmalloc_sync_unmappings(void); /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values diff --git a/kernel/notifier.c b/kernel/notifier.c index 5989bbb93039..84c987dfbe03 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -519,7 +519,6 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29615f15a820..f12e99b387b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8527,18 +8527,6 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot = false; #endif - /* - * Because of some magic with the way alloc_percpu() works on - * x86_64, we need to synchronize the pgd of all the tables, - * otherwise the trace events that happen in x86_64 page fault - * handlers can't cope with accessing the chance that a - * alloc_percpu()'d memory might be touched in the page fault trace - * event. Oh, and we need to audit all other alloc_percpu() and vmalloc() - * calls in tracing, because something might get triggered within a - * page fault trace event! 
- */ - vmalloc_sync_mappings(); - return 0; } diff --git a/mm/nommu.c b/mm/nommu.c index 371697bf372d..dfae55f41901 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -371,18 +371,6 @@ void vm_unmap_aliases(void) } EXPORT_SYMBOL_GPL(vm_unmap_aliases); -/* - * Implement a stub for vmalloc_sync_[un]mapping() if the architecture - * chose not to have one. - */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 154e3396154c..1e94497b7388 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1353,12 +1353,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) if (unlikely(valist == NULL)) return false; - /* - * First make sure the mappings are removed from all page-tables - * before they are freed. - */ - vmalloc_sync_unmappings(); - /* * TODO: to calculate a flush range without looping. * The list can be up to lazy_max_pages() elements. @@ -3089,21 +3083,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -/* - * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose - * not to have one. - * - * The purpose of this function is to make sure the vmalloc area - * mappings are identical in all page-tables in the system. - */ -void __weak vmalloc_sync_mappings(void) -{ -} - -void __weak vmalloc_sync_unmappings(void) -{ -} - static int f(pte_t *pte, unsigned long addr, void *data) { pte_t ***p = data; -- cgit v1.2.3 From 7f0a002b5a21302d9f4b29ba83c96cd433ff3769 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:40 -0700 Subject: x86/mm: remove vmalloc faulting Remove fault handling on vmalloc areas, as the vmalloc code now takes care of synchronizing changes to all page-tables in the system. 
Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-8-joro@8bytes.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/switch_to.h | 23 ------- arch/x86/kernel/setup_percpu.c | 6 +- arch/x86/mm/fault.c | 134 --------------------------------------- arch/x86/mm/pti.c | 8 +-- arch/x86/mm/tlb.c | 37 ----------- 5 files changed, 4 insertions(+), 204 deletions(-) diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 0e059b73437b..9f69cc497f4b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -12,27 +12,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, __visible struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); -/* This runs runs on the previous thread's stack. */ -static inline void prepare_switch_to(struct task_struct *next) -{ -#ifdef CONFIG_VMAP_STACK - /* - * If we switch to a stack that has a top-level paging entry - * that is not present in the current mm, the resulting #PF will - * will be promoted to a double-fault and we'll panic. Probe - * the new stack now so that vmalloc_fault can fix up the page - * tables if needed. This can only happen if we use a stack - * in vmap space. - * - * We assume that the stack is aligned so that it never spans - * more than one top-level paging entry. - * - * To minimize cache pollution, just follow the stack pointer. 
- */ - READ_ONCE(*(unsigned char *)next->thread.sp); -#endif -} - asmlinkage void ret_from_fork(void); /* @@ -67,8 +46,6 @@ struct fork_frame { #define switch_to(prev, next, last) \ do { \ - prepare_switch_to(next); \ - \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e6d7894ad127..fd945ce78554 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -287,9 +287,9 @@ void __init setup_per_cpu_areas(void) /* * Sync back kernel address range again. We already did this in * setup_arch(), but percpu data also needs to be available in - * the smpboot asm. We can't reliably pick up percpu mappings - * using vmalloc_fault(), because exception dispatch needs - * percpu data. + * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to + * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available + * there too. * * FIXME: Can the later sync in setup_cpu_entry_areas() replace * this call? diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 255fc631b042..dffe8e4d3140 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -214,44 +214,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. 
- */ - pgd_paddr = read_cr3_pa(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - if (pmd_large(*pmd_k)) - return 0; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - /* * Did it hit the DOS screen memory VA from vm86 mode? */ @@ -316,79 +278,6 @@ out: #else /* CONFIG_X86_64: */ -/* - * 64-bit: - * - * Handle a fault on the vmalloc area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_k; - p4d_t *p4d, *p4d_k; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); - pgd_k = pgd_offset_k(address); - if (pgd_none(*pgd_k)) - return -1; - - if (pgtable_l5_enabled()) { - if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); - } - } - - /* With 4-level paging, copying happens on the p4d level. 
*/ - p4d = p4d_offset(pgd, address); - p4d_k = p4d_offset(pgd_k, address); - if (p4d_none(*p4d_k)) - return -1; - - if (p4d_none(*p4d) && !pgtable_l5_enabled()) { - set_p4d(p4d, *p4d_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); - } - - BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); - - pud = pud_offset(p4d, address); - if (pud_none(*pud)) - return -1; - - if (pud_large(*pud)) - return 0; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return -1; - - if (pmd_large(*pmd)) - return 0; - - pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR @@ -1227,29 +1116,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); - /* - * We can fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * Before doing this on-demand faulting, ensure that the - * fault is not any of the following: - * 1. A fault on a PTE with a reserved bit set. - * 2. A fault caused by a user-mode access. (Do not demand- - * fault kernel memory due to user-mode accesses). - * 3. A fault caused by a page-level protection violation. - * (A demand fault would be on a non-present page which - * would have X86_PF_PROT==0). - */ - if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { - if (vmalloc_fault(address) >= 0) - return; - } - /* Was the fault spurious, caused by lazy TLB invalidation? 
*/ if (spurious_kernel_fault(hw_error_code, address)) return; diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 843aa10a4cb6..da0fb17a1a36 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -448,13 +448,7 @@ static void __init pti_clone_user_shared(void) * the sp1 and sp2 slots. * * This is done for all possible CPUs during boot to ensure - * that it's propagated to all mms. If we were to add one of - * these mappings during CPU hotplug, we would need to take - * some measure to make sure that every mm that subsequently - * ran on that CPU would have the relevant PGD entry in its - * pagetables. The usual vmalloc_fault() mechanism would not - * work for page faults taken in entry_SYSCALL_64 before RSP - * is set up. + * that it's propagated to all mms. */ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 66f96f21a7b6..f3fe261e5936 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -161,34 +161,6 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } -static void sync_current_stack_to_mm(struct mm_struct *mm) -{ - unsigned long sp = current_stack_pointer; - pgd_t *pgd = pgd_offset(mm, sp); - - if (pgtable_l5_enabled()) { - if (unlikely(pgd_none(*pgd))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - - set_pgd(pgd, *pgd_ref); - } - } else { - /* - * "pgd" is faked. The top level entries are "p4d"s, so sync - * the p4d. This compiles to approximately the same code as - * the 5-level case. 
- */ - p4d_t *p4d = p4d_offset(pgd, sp); - - if (unlikely(p4d_none(*p4d))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); - - set_p4d(p4d, *p4d_ref); - } - } -} - static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) { unsigned long next_tif = task_thread_info(next)->flags; @@ -377,15 +349,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ cond_ibpb(tsk); - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. - */ - sync_current_stack_to_mm(next); - } - /* * Stop remote flushes for the previous mm. * Skip kernel threads; we never send init_mm TLB flushing IPIs, -- cgit v1.2.3 From ca734cc67e3c8ce20eb75dd1e45ae86e10be102b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 1 Jun 2020 21:52:43 -0700 Subject: kasan: fix clang compilation warning due to stack protector KASAN uses a single cc-option invocation to disable both conserve-stack and stack-protector flags. The former flag is not present in Clang, which causes cc-option to fail, and results in stack-protector being enabled. Fix by using separate cc-option calls for each flag. Also collect all flags in a variable to avoid calling cc-option multiple times for different files. 
Reported-by: Qian Cai Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Reviewed-by: Marco Elver Link: http://lkml.kernel.org/r/c2f0c8e4048852ae014f4a391d96ca42d27e3255.1590779332.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- mm/kasan/Makefile | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index de3121848ddf..d532c2587731 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -15,14 +15,19 @@ CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE) # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 -CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_init.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_quarantine.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING -CFLAGS_tags_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING +CC_FLAGS_KASAN_RUNTIME := $(call cc-option, -fno-conserve-stack) +CC_FLAGS_KASAN_RUNTIME += $(call cc-option, -fno-stack-protector) +# Disable branch tracing to avoid recursion. 
+CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING + +CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_generic_report.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_tags_report.o := $(CC_FLAGS_KASAN_RUNTIME) obj-$(CONFIG_KASAN) := common.o init.o report.o obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o -- cgit v1.2.3 From 9380ce246a052a1e00121cd480028b6907aeae38 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jun 2020 21:52:46 -0700 Subject: ubsan: entirely disable alignment checks under UBSAN_TRAP Commit 8d58f222e85f ("ubsan: disable UBSAN_ALIGNMENT under COMPILE_TEST") tried to fix the pathological results of UBSAN_ALIGNMENT with UBSAN_TRAP (which objtool would rightly scream about), but it made an assumption about how COMPILE_TEST gets set (it is not set for randconfig). As a result, we need a bigger hammer here: just don't allow the alignment checks with the trap mode. 
Fixes: 8d58f222e85f ("ubsan: disable UBSAN_ALIGNMENT under COMPILE_TEST") Reported-by: Randy Dunlap Signed-off-by: Kees Cook Signed-off-by: Andrew Morton Acked-by: Randy Dunlap Cc: Josh Poimboeuf Cc: Dmitry Vyukov Cc: Elena Petrova Link: http://lkml.kernel.org/r/202005291236.000FCB6@keescook Link: https://lore.kernel.org/lkml/742521db-1e8c-0d7a-1ed4-a908894fb497@infradead.org/ Signed-off-by: Linus Torvalds --- lib/Kconfig.ubsan | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index 929211039bac..27bcc2568c95 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -63,7 +63,7 @@ config UBSAN_SANITIZE_ALL config UBSAN_ALIGNMENT bool "Enable checks for pointers alignment" default !HAVE_EFFICIENT_UNALIGNED_ACCESS - depends on !X86 || !COMPILE_TEST + depends on !UBSAN_TRAP help This option enables the check of unaligned memory accesses. Enabling this option on architectures that support unaligned -- cgit v1.2.3 From 86fea8b49494a06a99c6a7511b73b97adbaf4a5b Mon Sep 17 00:00:00 2001 From: Jing Xia Date: Mon, 1 Jun 2020 21:52:49 -0700 Subject: mm/mm_init.c: report kasan-tag information stored in page->flags The pageflags_layout_usage shows an incorrect message by means of mminit_loglevel when KASAN runs in software tag-based mode, enabled with CONFIG_KASAN_SW_TAGS. This patch corrects it and reports kasan-tag information.
Signed-off-by: Jing Xia Signed-off-by: Andrew Morton Cc: Chunyan Zhang Cc: Orson Zhai Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Link: http://lkml.kernel.org/r/1586929370-10838-1-git-send-email-jing.xia.mail@gmail.com Signed-off-by: Linus Torvalds --- mm/mm_init.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 7da6991d9435..435e5f794b3b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -67,26 +67,30 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH + - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, LAST_CPUPID_WIDTH, + KASAN_TAG_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastcpupid %d\n", + "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_CPUPID_SHIFT); + LAST_CPUPID_SHIFT, + KASAN_TAG_WIDTH); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", - "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", + "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_CPUPID_PGSHIFT); + (unsigned long)LAST_CPUPID_PGSHIFT, + (unsigned long)KASAN_TAG_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), -- cgit v1.2.3 From 4fba37586e4e73f9f9a855e610e151ef7da2b481 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 1 Jun 2020 21:52:53 
-0700 Subject: kasan: move kasan_report() into report.c The kasan_report() function belongs to report.c, as it's a common function that does error reporting. Reported-by: Leon Romanovsky Signed-off-by: Andrey Konovalov Signed-off-by: Andrew Morton Tested-by: Leon Romanovsky Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Leon Romanovsky Link: http://lkml.kernel.org/r/78a81fde6eeda9db72a7fd55fbc33173a515e4b1.1589297433.git.andreyknvl@google.com Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 19 ------------------- mm/kasan/report.c | 22 ++++++++++++++++++++-- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2906358e42f0..757d4074fe28 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include @@ -613,24 +612,6 @@ void kasan_free_shadow(const struct vm_struct *vm) } #endif -extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -extern bool report_enabled(void); - -bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) -{ - unsigned long flags = user_access_save(); - bool ret = false; - - if (likely(report_enabled())) { - __kasan_report(addr, size, is_write, ip); - ret = true; - } - - user_access_restore(flags); - - return ret; -} - #ifdef CONFIG_MEMORY_HOTPLUG static bool shadow_mapped(unsigned long addr) { diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 80f23c9da6b0..51ec45407a0b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -454,7 +455,7 @@ static void print_shadow_for_address(const void *addr) } } -bool report_enabled(void) +static bool report_enabled(void) { if (current->kasan_depth) return false; @@ -479,7 +480,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip) end_report(&flags); } -void __kasan_report(unsigned long addr, size_t size, bool
is_write, unsigned long ip) +static void __kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) { struct kasan_access_info info; void *tagged_addr; @@ -518,6 +520,22 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon end_report(&flags); } +bool kasan_report(unsigned long addr, size_t size, bool is_write, + unsigned long ip) +{ + unsigned long flags = user_access_save(); + bool ret = false; + + if (likely(report_enabled())) { + __kasan_report(addr, size, is_write, ip); + ret = true; + } + + user_access_restore(flags); + + return ret; +} + #ifdef CONFIG_KASAN_INLINE /* * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high -- cgit v1.2.3