diff options
Diffstat (limited to 'fs')
105 files changed, 769 insertions, 635 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index c718b2e2de0e..5b4847bd2fbb 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -368,6 +368,7 @@ config GRACE_PERIOD config LOCKD tristate depends on FILE_LOCKING + select CRC32 select GRACE_PERIOD config LOCKD_V4 diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 691e0ae607a1..8c6130789fde 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -348,9 +348,9 @@ static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx) } if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) { - rcu_read_lock(); + down_read(&net->cells_lock); ret = afs_dynroot_readdir_cells(net, ctx); - rcu_read_unlock(); + up_read(&net->cells_lock); } return ret; } diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index bf1c94e51dd0..07709b0d7688 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -4,7 +4,7 @@ config BCACHEFS_FS depends on BLOCK select EXPORTFS select CLOSURES - select LIBCRC32C + select CRC32 select CRC64 select FS_POSIX_ACL select LZ4_COMPRESS @@ -15,10 +15,9 @@ config BCACHEFS_FS select ZLIB_INFLATE select ZSTD_COMPRESS select ZSTD_DECOMPRESS - select CRYPTO select CRYPTO_LIB_SHA256 - select CRYPTO_CHACHA20 - select CRYPTO_POLY1305 + select CRYPTO_LIB_CHACHA + select CRYPTO_LIB_POLY1305 select KEYS select RAID6_PQ select XOR_BLOCKS diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 5d9f208a1bb7..75f7408da173 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -788,6 +788,8 @@ struct bch_fs { unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; u64 btrees_lost_data; } sb; + DARRAY(enum bcachefs_metadata_version) + incompat_versions_requested; #ifdef CONFIG_UNICODE struct unicode_map *cf_encoding; @@ -981,8 +983,8 @@ struct bch_fs { mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; size_t zstd_workspace_size; - struct crypto_sync_skcipher *chacha20; - struct crypto_shash *poly1305; + struct bch_key chacha20_key; + bool chacha20_key_set; atomic64_t key_version; diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index d1ad1a7613c9..7d6c971db23c 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -644,8 +644,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, */ static int journal_sort_key_cmp(const void *_l, const void *_r) { - cond_resched(); - const struct journal_key *l = _l; const struct journal_key *r = _r; @@ -689,7 +687,8 @@ void bch2_journal_keys_put(struct bch_fs *c) static void __journal_keys_sort(struct journal_keys *keys) { - sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); + sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]), + journal_sort_key_cmp, NULL); cond_resched(); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 8c9fdb7263fe..86acf037590c 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -183,7 +183,7 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, return; if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { - if (!c->chacha20) + if (!c->chacha20_key_set) return; struct nonce nonce = btree_nonce(&bn->keys, 0); @@ -398,7 +398,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) bch2_print_string_as_lines(KERN_INFO, buf.buf); } - sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); + sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); dst = 0; darray_for_each(f->nodes, i) { @@ -418,7 +418,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) } f->nodes.nr = dst; - sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); + sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); if (0 && c->opts.verbose) { printbuf_reset(&buf); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 55fbeeb8eaaa..44b5fe430370 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1221,7 +1221,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ret = bch2_disk_reservation_get(c, &as->disk_res, (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), - c->opts.metadata_replicas, + READ_ONCE(c->opts.metadata_replicas), disk_res_flags); if (ret) goto err; diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index adbe576ec77e..0941fb2c026d 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -428,10 +428,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) */ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); - sort(wb->flushing.keys.data, - wb->flushing.keys.nr, - sizeof(wb->flushing.keys.data[0]), - wb_key_seq_cmp, NULL); + sort_nonatomic(wb->flushing.keys.data, + wb->flushing.keys.nr, + sizeof(wb->flushing.keys.data[0]), + wb_key_seq_cmp, NULL); darray_for_each(wb->flushing.keys, i) { if (!i->journal_seq) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index fea61e60a9ee..4ef261e8db4f 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -37,7 +37,8 @@ void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage) { memset(usage, 0, sizeof(*usage)); - acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s()); + acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, + sizeof(struct bch_dev_usage_full) / sizeof(u64)); } static u64 reserve_factor(u64 r) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 1c38b165f48b..8d75b27a1418 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -242,11 +242,6 @@ static inline u64 dev_buckets_available(struct bch_dev *ca, /* Filesystem usage: */ -static inline unsigned dev_usage_u64s(void) -{ - return sizeof(struct bch_dev_usage) / sizeof(u64); -} - struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3726689093e3..d0a34a097b80 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -7,17 +7,12 @@ #include "super-io.h" #include <linux/crc32c.h> -#include <linux/crypto.h> #include <linux/xxhash.h> #include <linux/key.h> #include <linux/random.h> #include <linux/ratelimit.h> -#include <linux/scatterlist.h> -#include <crypto/algapi.h> #include <crypto/chacha.h> -#include <crypto/hash.h> #include <crypto/poly1305.h> -#include <crypto/skcipher.h> #include <keys/user-type.h> /* @@ -96,116 +91,40 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void * } } -static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, - struct nonce nonce, - struct scatterlist *sg, size_t len) +static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS], + const struct bch_key *key, struct nonce nonce) { - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)]; - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, len, nonce.d); + BUILD_BUG_ON(sizeof(key_words) != sizeof(*key)); + memcpy(key_words, key, sizeof(key_words)); + le32_to_cpu_array(key_words, ARRAY_SIZE(key_words)); - int ret = crypto_skcipher_encrypt(req); - if (ret) - pr_err("got error %i from crypto_skcipher_encrypt()", ret); - - return ret; -} - -static inline int do_encrypt(struct crypto_sync_skcipher *tfm, - struct nonce nonce, - void *buf, size_t len) -{ - if (!is_vmalloc_addr(buf)) { - struct scatterlist sg = {}; - - sg_mark_end(&sg); - sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf)); - return do_encrypt_sg(tfm, nonce, &sg, len); - } else { - DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; - size_t sgl_len = 0; - int ret; - - darray_init(&sgl); - - while (len) { - unsigned offset = offset_in_page(buf); - struct scatterlist sg = { - .page_link = (unsigned long) vmalloc_to_page(buf), - .offset = offset, - .length = min(len, PAGE_SIZE - offset), - }; + BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE); + chacha_init(state, key_words, (const u8 *)nonce.d); - if (darray_push(&sgl, sg)) { - sg_mark_end(&darray_last(sgl)); - ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); - if (ret) - goto err; - - nonce = nonce_add(nonce, sgl_len); - sgl_len = 0; - sgl.nr = 0; - BUG_ON(darray_push(&sgl, sg)); - } - - buf += sg.length; - len -= sg.length; - sgl_len += sg.length; - } - - sg_mark_end(&darray_last(sgl)); - ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); -err: - darray_exit(&sgl); - return ret; - } + memzero_explicit(key_words, sizeof(key_words)); } -int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, - void *buf, size_t len) +static void bch2_chacha20(const struct bch_key *key, struct nonce nonce, + void *data, size_t len) { - struct crypto_sync_skcipher *chacha20 = - crypto_alloc_sync_skcipher("chacha20", 0, 0); - int ret; - - ret = PTR_ERR_OR_ZERO(chacha20); - if (ret) { - pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret)); - return ret; - } - - ret = crypto_skcipher_setkey(&chacha20->base, - (void *) key, sizeof(*key)); - if (ret) { - pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret)); - goto err; - } + u32 state[CHACHA_STATE_WORDS]; - ret = do_encrypt(chacha20, nonce, buf, len); -err: - crypto_free_sync_skcipher(chacha20); - return ret; + bch2_chacha20_init(state, key, nonce); + chacha20_crypt(state, data, data, len); + memzero_explicit(state, sizeof(state)); } -static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, - struct nonce nonce) +static void bch2_poly1305_init(struct poly1305_desc_ctx *desc, + struct bch_fs *c, struct nonce nonce) { - u8 key[POLY1305_KEY_SIZE]; - int ret; + u8 key[POLY1305_KEY_SIZE] = { 0 }; nonce.d[3] ^= BCH_NONCE_POLY; - memset(key, 0, sizeof(key)); - ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); - if (ret) - return ret; - - desc->tfm = c->poly1305; - crypto_shash_init(desc); - crypto_shash_update(desc, key, sizeof(key)); - return 0; + bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key)); + poly1305_init(desc, key); } struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, @@ -230,14 +149,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, case BCH_CSUM_chacha20_poly1305_80: case BCH_CSUM_chacha20_poly1305_128: { - SHASH_DESC_ON_STACK(desc, c->poly1305); + struct poly1305_desc_ctx dctx; u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; - gen_poly_key(c, desc, nonce); - - crypto_shash_update(desc, data, len); - crypto_shash_final(desc, digest); + bch2_poly1305_init(&dctx, c, nonce); + poly1305_update(&dctx, data, len); + poly1305_final(&dctx, digest); memcpy(&ret, digest, bch_crc_bytes[type]); return ret; @@ -253,11 +171,12 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, if (!bch2_csum_type_is_encryption(type)) return 0; - if (bch2_fs_inconsistent_on(!c->chacha20, + if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) return -BCH_ERR_no_encryption_key; - return do_encrypt(c->chacha20, nonce, data, len); + bch2_chacha20(&c->chacha20_key, nonce, data, len); + return 0; } static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, @@ -296,26 +215,26 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, case BCH_CSUM_chacha20_poly1305_80: case BCH_CSUM_chacha20_poly1305_128: { - SHASH_DESC_ON_STACK(desc, c->poly1305); + struct poly1305_desc_ctx dctx; u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; - gen_poly_key(c, desc, nonce); + bch2_poly1305_init(&dctx, c, nonce); #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; - crypto_shash_update(desc, p, bv.bv_len); + poly1305_update(&dctx, p, bv.bv_len); kunmap_local(p); } #else __bio_for_each_bvec(bv, bio, *iter, *iter) - crypto_shash_update(desc, + poly1305_update(&dctx, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); #endif - crypto_shash_final(desc, digest); + poly1305_final(&dctx, digest); memcpy(&ret, digest, bch_crc_bytes[type]); return ret; @@ -338,43 +257,33 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, { struct bio_vec bv; struct bvec_iter iter; - DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; - size_t sgl_len = 0; + u32 chacha_state[CHACHA_STATE_WORDS]; int ret = 0; - if (bch2_fs_inconsistent_on(!c->chacha20, + if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) return -BCH_ERR_no_encryption_key; - darray_init(&sgl); + bch2_chacha20_init(chacha_state, &c->chacha20_key, nonce); bio_for_each_segment(bv, bio, iter) { - struct scatterlist sg = { - .page_link = (unsigned long) bv.bv_page, - .offset = bv.bv_offset, - .length = bv.bv_len, - }; - - if (darray_push(&sgl, sg)) { - sg_mark_end(&darray_last(sgl)); - ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); - if (ret) - goto err; - - nonce = nonce_add(nonce, sgl_len); - sgl_len = 0; - sgl.nr = 0; - - BUG_ON(darray_push(&sgl, sg)); + void *p; + + /* + * chacha_crypt() assumes that the length is a multiple of + * CHACHA_BLOCK_SIZE on any non-final call. + */ + if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) { + bch_err_ratelimited(c, "bio not aligned for encryption"); + ret = -EIO; + break; } - sgl_len += sg.length; + p = bvec_kmap_local(&bv); + chacha20_crypt(chacha_state, p, p, bv.bv_len); + kunmap_local(p); } - - sg_mark_end(&darray_last(sgl)); - ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); -err: - darray_exit(&sgl); + memzero_explicit(chacha_state, sizeof(chacha_state)); return ret; } @@ -650,10 +559,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c, } /* decrypt real key: */ - ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), - &sb_key, sizeof(sb_key)); - if (ret) - goto err; + bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key)); if (bch2_key_is_encrypted(&sb_key)) { bch_err(c, "incorrect encryption key"); @@ -668,31 +574,6 @@ err: return ret; } -static int bch2_alloc_ciphers(struct bch_fs *c) -{ - if (c->chacha20) - return 0; - - struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); - int ret = PTR_ERR_OR_ZERO(chacha20); - if (ret) { - bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); - return ret; - } - - struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0); - ret = PTR_ERR_OR_ZERO(poly1305); - if (ret) { - bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); - crypto_free_sync_skcipher(chacha20); - return ret; - } - - c->chacha20 = chacha20; - c->poly1305 = poly1305; - return 0; -} - #if 0 /* @@ -797,35 +678,21 @@ err: void bch2_fs_encryption_exit(struct bch_fs *c) { - if (c->poly1305) - crypto_free_shash(c->poly1305); - if (c->chacha20) - crypto_free_sync_skcipher(c->chacha20); + memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key)); } int bch2_fs_encryption_init(struct bch_fs *c) { struct bch_sb_field_crypt *crypt; - struct bch_key key; - int ret = 0; + int ret; crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) - goto out; + return 0; - ret = bch2_alloc_ciphers(c); + ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key); if (ret) - goto out; - - ret = bch2_decrypt_sb_key(c, crypt, &key); - if (ret) - goto out; - - ret = crypto_skcipher_setkey(&c->chacha20->base, - (void *) &key.key, sizeof(key.key)); - if (ret) - goto out; -out: - memzero_explicit(&key, sizeof(key)); - return ret; + return ret; + c->chacha20_key_set = true; + return 0; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 4ac251c8fcd8..1310782d3ae9 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -69,7 +69,6 @@ static inline void bch2_csum_err_msg(struct printbuf *out, bch2_csum_to_text(out, type, expected); } -int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); int bch2_request_key(struct bch_sb *, struct bch_key *); #ifndef __KERNEL__ int bch2_revoke_key(struct bch_sb *); @@ -156,7 +155,7 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c, if (type >= BCH_CSUM_NR) return false; - if (bch2_csum_type_is_encryption(type) && !c->chacha20) + if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set) return false; return true; diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index de02ebf847ec..b211c97238ab 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -607,7 +607,7 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update prt_newline(out); printbuf_indent_add(out, 2); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - prt_printf(out, "read_done:\t\%u\n", m->read_done); + prt_printf(out, "read_done:\t%u\n", m->read_done); bch2_write_op_to_text(out, &m->op); printbuf_indent_sub(out, 2); } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index bf53a029f356..8488a7578115 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -287,8 +287,8 @@ static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, EBUG_ON(!dirent->v.d_casefold); EBUG_ON(!cf_name->len); - dirent->v.d_cf_name_block.d_name_len = name->len; - dirent->v.d_cf_name_block.d_cf_name_len = cf_name->len; + dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); + dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len); memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index c8696f01eb14..a615e4852ded 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -287,7 +287,7 @@ x(EIO, mark_stripe) \ x(EIO, stripe_reconstruct) \ x(EIO, key_type_error) \ - x(EIO, extent_poisened) \ + x(EIO, extent_poisoned) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index ae7c7a177e10..dca2b8425cc0 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -139,7 +139,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return -BCH_ERR_extent_poisened; + return -BCH_ERR_extent_poisoned; rcu_read_lock(); const union bch_extent_entry *entry; diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 19d4599918dc..e3a75dcca60c 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -225,11 +225,26 @@ static void bchfs_read(struct btree_trans *trans, bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); - swap(rbio->bio.bi_iter.bi_size, bytes); + /* + * Careful there's a landmine here if bch2_read_extent() ever + * starts returning transaction restarts here. + * + * We've changed rbio->bi_iter.bi_size to be "bytes we can read + * from this extent" with the swap call, and we restore it + * below. That restore needs to come before checking for + * errors. + * + * But unlike __bch2_read(), we use the rbio bvec iter, not one + * on the stack, so we can't do the restore right after the + * bch2_read_extent() call: we don't own that iterator anymore + * if BCH_READ_last_fragment is set, since we may have submitted + * that rbio instead of cloning it. + */ if (flags & BCH_READ_last_fragment) break; + swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); err: if (ret && diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index c1553e44e049..14886e1d4d6d 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -69,7 +69,7 @@ static int bch2_inode_flags_set(struct btree_trans *trans, if (ret < 0) return ret; - ret = bch2_request_incompat_feature(c,bcachefs_metadata_version_casefolding); + ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding); if (ret) return ret; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 18308f3d64a1..7b25cedd3e40 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -321,6 +321,31 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) return false; + /* + * Subvolume roots are special: older versions of subvolume roots may be + * disconnected, it's only the newest version that matters. + * + * We only keep a single dirent pointing to a subvolume root, i.e. + * older versions of snapshots will not have a different dirent pointing + * to the same subvolume root. + * + * This is because dirents that point to subvolumes are only visible in + * the parent subvolume - versioning is not needed - and keeping them + * around would break fsck, because when we're crossing subvolumes we + * don't have a consistent snapshot ID to do check the inode <-> dirent + * relationships. + * + * Thus, a subvolume root that's been renamed after a snapshot will have + * a disconnected older version - that's expected. + * + * Note that taking a snapshot always updates the root inode (to update + * the dirent backpointer), so a subvolume root inode with + * BCH_INODE_has_child_snapshot is never visible. + */ + if (inode->bi_subvol && + (inode->bi_flags & BCH_INODE_has_child_snapshot)) + return false; + return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); } @@ -1007,6 +1032,23 @@ static int check_inode_dirent_inode(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) return ret; + if ((ret || dirent_points_to_inode_nowarn(d, inode)) && + inode->bi_subvol && + (inode->bi_flags & BCH_INODE_has_child_snapshot)) { + /* Older version of a renamed subvolume root: we won't have a + * correct dirent for it. That's expected, see + * inode_should_reattach(). + * + * We don't clear the backpointer field when doing the rename + * because there might be arbitrarily many versions in older + * snapshots. + */ + inode->bi_dir = 0; + inode->bi_dir_offset = 0; + *write_inode = true; + goto out; + } + if (fsck_err_on(ret, trans, inode_points_to_missing_dirent, "inode points to missing dirent\n%s", @@ -1027,7 +1069,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, inode->bi_dir_offset = 0; *write_inode = true; } - +out: ret = 0; fsck_err: bch2_trans_iter_exit(trans, &dirent_iter); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 417bb0c7bbfa..def4a26a3b45 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -487,6 +487,8 @@ static void bch2_rbio_retry(struct work_struct *work) .inum = rbio->read_pos.inode, }; struct bch_io_failures failed = { .nr = 0 }; + int orig_error = rbio->ret; + struct btree_trans *trans = bch2_trans_get(c); trace_io_read_retry(&rbio->bio); @@ -519,7 +521,9 @@ static void bch2_rbio_retry(struct work_struct *work) if (ret) { rbio->ret = ret; rbio->bio.bi_status = BLK_STS_IOERR; - } else { + } else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace && + orig_error != -BCH_ERR_data_read_ptr_stale_race && + !failed.nr) { struct printbuf buf = PRINTBUF; lockrestart_do(trans, @@ -977,7 +981,8 @@ retry_pick: goto err; } - if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { + if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && + !c->chacha20_key_set) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); @@ -1344,14 +1349,16 @@ err: bch2_trans_iter_exit(trans, &iter); - if (ret) { - struct printbuf buf = PRINTBUF; - lockrestart_do(trans, - bch2_inum_offset_err_msg_trans(trans, &buf, inum, - bvec_iter.bi_sector << 9)); - prt_printf(&buf, "read error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); + if (unlikely(ret)) { + if (ret != -BCH_ERR_extent_poisoned) { + struct printbuf buf = PRINTBUF; + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, + bvec_iter.bi_sector << 9)); + prt_printf(&buf, "data read error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } rbio->bio.bi_status = BLK_STS_IOERR; rbio->ret = ret; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1b7961f4f609..2a54ac79189b 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1460,7 +1460,7 @@ fsck_err: static void journal_advance_devs_to_next_bucket(struct journal *j, struct dev_alloc_list *devs, - unsigned sectors, u64 seq) + unsigned sectors, __le64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 79fd18a5a07c..606d684e6f23 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -389,9 +389,9 @@ int bch2_journal_replay(struct bch_fs *c) * Now, replay any remaining keys in the order in which they appear in * the journal, unpinning those journal entries as we go: */ - sort(keys_sorted.data, keys_sorted.nr, - sizeof(keys_sorted.data[0]), - journal_sort_seq_cmp, NULL); + sort_nonatomic(keys_sorted.data, keys_sorted.nr, + sizeof(keys_sorted.data[0]), + journal_sort_seq_cmp, NULL); darray_for_each(keys_sorted, kp) { cond_resched(); @@ -1125,7 +1125,10 @@ int bch2_fs_initialize(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - bch2_fs_journal_start(&c->journal, 1); + ret = bch2_fs_journal_start(&c->journal, 1); + if (ret) + goto err; + set_bit(BCH_FS_accounting_replay_done, &c->flags); bch2_journal_set_replay_done(&c->journal); diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 5d43e3504386..dc53d25c7cbb 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -290,8 +290,8 @@ enum bch_fsck_flags { x(btree_node_bkey_bad_u64s, 260, 0) \ x(btree_node_topology_empty_interior_node, 261, 0) \ x(btree_ptr_v2_min_key_bad, 262, 0) \ - x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ - x(snapshot_node_missing, 264, 0) \ + x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \ + x(snapshot_node_missing, 264, FSCK_AUTOFIX) \ x(dup_backpointer_to_bad_csum_extent, 265, 0) \ x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ x(sb_clean_entry_overrun, 267, 0) \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index e27422b6d9c6..25b6bce05c3c 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -73,14 +73,30 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v ? 0 : -BCH_ERR_may_not_use_incompat_feature; + mutex_lock(&c->sb_lock); if (!ret) { - mutex_lock(&c->sb_lock); SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); bch2_write_super(c); - mutex_unlock(&c->sb_lock); + } else { + darray_for_each(c->incompat_versions_requested, i) + if (version == *i) + goto out; + + darray_push(&c->incompat_versions_requested, version); + struct printbuf buf = PRINTBUF; + prt_str(&buf, "requested incompat feature "); + bch2_version_to_text(&buf, version); + prt_str(&buf, " currently not enabled"); + prt_printf(&buf, "\n set version_upgrade=incompat to enable"); + + bch_notice(c, "%s", buf.buf); + printbuf_exit(&buf); } +out: + mutex_unlock(&c->sb_lock); + return ret; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a58edde43bee..e8a17ed1615d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -70,14 +70,10 @@ #include <linux/percpu.h> #include <linux/random.h> #include <linux/sysfs.h> -#include <crypto/hash.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); MODULE_DESCRIPTION("bcachefs filesystem"); -MODULE_SOFTDEP("pre: chacha20"); -MODULE_SOFTDEP("pre: poly1305"); -MODULE_SOFTDEP("pre: xxhash"); const char * const bch2_fs_flag_strs[] = { #define x(n) #n, @@ -593,6 +589,7 @@ static void __bch2_fs_free(struct bch_fs *c) free_percpu(c->online_reserved); } + darray_exit(&c->incompat_versions_requested); darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); free_percpu(c->usage); @@ -1002,12 +999,6 @@ static void print_mount_opts(struct bch_fs *c) prt_str(&p, "starting version "); bch2_version_to_text(&p, c->sb.version); - if (c->opts.read_only) { - prt_str(&p, " opts="); - first = false; - prt_printf(&p, "ro"); - } - for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); @@ -1023,6 +1014,11 @@ static void print_mount_opts(struct bch_fs *c) bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); } + if (c->sb.version_incompat_allowed != c->sb.version) { + prt_printf(&p, "\n allowing incompatible features above "); + bch2_version_to_text(&p, c->sb.version_incompat_allowed); + } + bch_info(c, "%s", p.buf); printbuf_exit(&p); } @@ -1767,7 +1763,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) up_write(&c->state_lock); return 0; err: - if (ca->mi.state == BCH_MEMBER_STATE_rw && + if (test_bit(BCH_FS_rw, &c->flags) && + ca->mi.state == BCH_MEMBER_STATE_rw && !percpu_ref_is_zero(&ca->io_ref[READ])) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index fa8515598341..73a2dfb854c5 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -3,9 +3,9 @@ config BTRFS_FS tristate "Btrfs filesystem support" select BLK_CGROUP_PUNT_BIO + select CRC32 select CRYPTO select CRYPTO_CRC32C - select LIBCRC32C select CRYPTO_XXHASH select CRYPTO_SHA256 select CRYPTO_BLAKE2B diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3dd555db3d32..aa58e0663a5d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3853,7 +3853,6 @@ static int write_dev_supers(struct btrfs_device *device, atomic_inc(&device->sb_write_errors); continue; } - ASSERT(folio_order(folio) == 0); offset = offset_in_folio(folio, bytenr); disk_super = folio_address(folio) + offset; @@ -3926,7 +3925,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) /* If the folio has been removed, then we know it completed. */ if (IS_ERR(folio)) continue; - ASSERT(folio_order(folio) == 0); /* Folio will be unlocked once the write completes. */ folio_wait_locked(folio); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 262a707d8990..71b8a825c447 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2104,15 +2104,20 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * will always return true. * So here we need to do extra page alignment for * filemap_range_has_page(). + * + * And do not decrease page_lockend right now, as it can be 0. */ const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); - const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE); while (1) { truncate_pagecache_range(inode, lockstart, lockend); lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); + /* The same page or adjacent pages. */ + if (page_lockend <= page_lockstart) + break; /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2124,7 +2129,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * we do, unlock the range and retry. */ if (!filemap_range_has_page(inode->i_mapping, page_lockstart, - page_lockend)) + page_lockend - 1)) break; unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a13d81bb56a0..63aeacc54945 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4902,6 +4902,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, &disk_bytenr, &disk_io_size); + if (ret == -EAGAIN) + goto out_acct; if (ret < 0 && ret != -EIOCBQUEUED) goto out_free; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f948f4f6431c..e17bcb034595 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3803,7 +3803,7 @@ out: if (ret) { if (inode) iput(&inode->vfs_inode); - inode = ERR_PTR(ret); + return ERR_PTR(ret); } return &inode->vfs_inode; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 11dbd7be6a3b..c0a0b8b063d0 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -204,7 +204,7 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ - __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \ __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -666,7 +666,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, btrfs_blocks_per_folio(fs_info, folio); \ const struct btrfs_subpage *subpage = folio_get_private(folio); \ \ - ASSERT(blocks_per_folio < BITS_PER_LONG); \ + ASSERT(blocks_per_folio <= BITS_PER_LONG); \ *dst = bitmap_read(subpage->bitmaps, \ blocks_per_folio * btrfs_bitmap_nr_##name, \ blocks_per_folio); \ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 40709e2a44fc..7121d8c7a318 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1139,8 +1139,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) subvol_name = btrfs_get_subvol_name_from_objectid(info, btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); if (!IS_ERR(subvol_name)) { - seq_puts(seq, ",subvol="); - seq_escape(seq, subvol_name, " \t\n\\"); + seq_show_option(seq, "subvol", subvol_name); kfree(subvol_name); } return 0; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43979891f7c8..2b66a6130269 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -2235,7 +2235,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", eb->start, check->level, found_level); - return -EIO; + return -EUCLEAN; } if (!check->has_first_key) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index fb8b8b29c169..4a3e02b49f29 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1277,7 +1277,7 @@ struct zone_info { static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct zone_info *info, unsigned long *active, - struct btrfs_chunk_map *map) + struct btrfs_chunk_map *map, bool new) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *device; @@ -1307,6 +1307,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, return 0; } + ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical)); + /* This zone will be used for allocation, so mark this zone non-empty. */ btrfs_dev_clear_zone_empty(device, info->physical); @@ -1319,6 +1321,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, * to determine the allocation offset within the zone. */ WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size)); + + if (new) { + sector_t capacity; + + capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT); + up_read(&dev_replace->rwsem); + info->alloc_offset = 0; + info->capacity = capacity << SECTOR_SHIFT; + + return 0; + } + nofs_flag = memalloc_nofs_save(); ret = btrfs_get_dev_zone(device, info->physical, &zone); memalloc_nofs_restore(nofs_flag); @@ -1588,7 +1602,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } for (i = 0; i < map->num_stripes; i++) { - ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map); + ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new); if (ret) goto out; @@ -1659,7 +1673,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * stripe. */ cache->alloc_offset = cache->zone_capacity; - ret = 0; } out: diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c index bf935e25bdbe..b48525680e73 100644 --- a/fs/cachefiles/key.c +++ b/fs/cachefiles/key.c @@ -8,7 +8,7 @@ #include <linux/slab.h> #include "internal.h" -static const char cachefiles_charmap[64] = +static const char cachefiles_charmap[64] __nonstring = "0123456789" /* 0 - 9 */ "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */ "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */ diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 7249d70e1a43..3e7def3d31c1 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -3,7 +3,7 @@ config CEPH_FS tristate "Ceph distributed file system" depends on INET select CEPH_LIB - select LIBCRC32C + select CRC32 select CRYPTO_AES select CRYPTO select NETFS_SUPPORT @@ -396,6 +396,7 @@ static inline unsigned long dax_folio_put(struct folio *folio) order = folio_order(folio); if (!order) return 0; + folio_reset_order(folio); for (i = 0; i < (1UL << order); i++) { struct dev_pagemap *pgmap = page_pgmap(&folio->page); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 42e4d6eeb29f..9c20d78e41f6 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -89,12 +89,12 @@ enum { }; static const struct fs_parameter_spec devpts_param_specs[] = { - fsparam_u32 ("gid", Opt_gid), + fsparam_gid ("gid", Opt_gid), fsparam_s32 ("max", Opt_max), fsparam_u32oct ("mode", Opt_mode), fsparam_flag ("newinstance", Opt_newinstance), fsparam_u32oct ("ptmxmode", Opt_ptmxmode), - fsparam_u32 ("uid", Opt_uid), + fsparam_uid ("uid", Opt_uid), {} }; diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 331e49cd1b8d..8f68ec49ad89 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -3,8 +3,8 @@ config EROFS_FS tristate "EROFS filesystem support" depends on BLOCK + select CRC32 select FS_IOMAP - select LIBCRC32C help EROFS (Enhanced Read-Only File System) is a lightweight read-only file system with modern designs (e.g. no buffer heads, inline diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 9581e9bf8192..767fb4acdc93 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -56,7 +56,7 @@ struct erofs_super_block { union { __le16 rootnid_2b; /* nid of root directory */ __le16 blocks_hi; /* (48BIT on) blocks count MSB */ - } rb; + } __packed rb; __le64 inos; /* total valid ino # (== f_files - f_favail) */ __le64 epoch; /* base seconds used for compact inodes */ __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */ @@ -148,7 +148,7 @@ union erofs_inode_i_nb { __le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */ __le16 blocks_hi; /* total blocks count MSB */ __le16 startblk_hi; /* starting block number MSB */ -}; +} __packed; /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { @@ -369,9 +369,9 @@ struct z_erofs_map_header { * bit 7 : pack the whole file into packed inode */ __u8 h_clusterbits; - }; + } __packed; __le16 h_extents_hi; /* extent count MSB */ - }; + } __packed; }; enum { diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index bec4b56b3826..4fa0a0121288 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -32,6 +32,8 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) ret = 0; } if (rq->bio.bi_end_io) { + if (ret < 0 && !rq->bio.bi_status) + rq->bio.bi_status = errno_to_blk_status(ret); rq->bio.bi_end_io(&rq->bio); } else { bio_for_each_folio_all(fi, &rq->bio) { diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 0671184d9cf1..5c061aaeeb45 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -725,7 +725,6 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) lockref_init(&pcl->lockref); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->pclustersize = map->m_plen; - pcl->pageofs_in = pageofs_in; pcl->length = 0; pcl->partial = true; pcl->next = fe->head; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 8de50df05dfe..14ea47f954f5 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -559,7 +559,8 @@ static int z_erofs_map_blocks_ext(struct inode *inode, pos += sizeof(__le64); lstart = 0; } else { - lstart = map->m_la >> vi->z_lclusterbits; + lstart = round_down(map->m_la, 1 << vi->z_lclusterbits); + pos += (lstart >> vi->z_lclusterbits) * recsz; pa = EROFS_NULL_ADDR; } @@ -614,7 +615,7 @@ static int z_erofs_map_blocks_ext(struct inode *inode, if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT; vi->z_fragmentoff = map->m_plen; - if (recsz >= offsetof(struct z_erofs_extent, pstart_lo)) + if (recsz > offsetof(struct z_erofs_extent, pstart_lo)) vi->z_fragmentoff |= map->m_pa << 32; } else if (map->m_plen) { map->m_flags |= EROFS_MAP_MAPPED | diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 100376863a44..4bc264b854c4 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1996,6 +1996,14 @@ static int ep_try_send_events(struct eventpoll *ep, return res; } +static int ep_schedule_timeout(ktime_t *to) +{ + if (to) + return ktime_after(*to, ktime_get()); + else + return 1; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. @@ -2103,7 +2111,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, write_unlock_irq(&ep->lock); - if (!eavail) + if (!eavail && ep_schedule_timeout(to)) timed_out = !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 87ee3a17bd29..e8c5525afc67 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line, { __le32 *bref = p; unsigned int blk; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + if (journal && inode == journal->j_inode) return 0; while (bref < p+max) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1dc09ed5d403..94c7d2d828a6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -386,10 +386,11 @@ static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal && inode == journal->j_inode) return 0; + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " @@ -4724,22 +4725,43 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) inode_set_iversion_queried(inode, val); } -static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) - +static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, + const char *function, unsigned int line) { + const char *err_str; + if (flags & EXT4_IGET_EA_INODE) { - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "missing EA_INODE flag"; + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + err_str = "missing EA_INODE flag"; + goto error; + } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || - EXT4_I(inode)->i_file_acl) - return "ea_inode with extended attributes"; + EXT4_I(inode)->i_file_acl) { + err_str = "ea_inode with extended attributes"; + goto error; + } } else { - if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "unexpected EA_INODE flag"; + if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + /* + * open_by_handle_at() could provide an old inode number + * that has since been reused for an ea_inode; this does + * not indicate filesystem corruption + */ + if (flags & EXT4_IGET_HANDLE) + return -ESTALE; + err_str = "unexpected EA_INODE flag"; + goto error; + } + } + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; + goto error; } - if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) - return "unexpected bad inode w/o EXT4_IGET_BAD"; - return NULL; + return 0; + +error: + ext4_error_inode(inode, function, line, 0, err_str); + return -EFSCORRUPTED; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -4751,7 +4773,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; - const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; @@ -4780,10 +4801,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); + ret = check_igot_inode(inode, flags, function, line); + if (ret) { iput(inode); - return ERR_PTR(-EFSCORRUPTED); + return ERR_PTR(ret); } return inode; } @@ -5065,13 +5086,21 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); - ret = -EFSCORRUPTED; - goto bad_inode; + ret = check_igot_inode(inode, flags, function, line); + /* + * -ESTALE here means there is nothing inherently wrong with the inode, + * it's just not an inode we can return for an fhandle lookup. + */ + if (ret == -ESTALE) { + brelse(iloc.bh); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-ESTALE); } - + if (ret) + goto bad_inode; brelse(iloc.bh); + unlock_new_inode(inode); return inode; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0d523e9fb3d5..f88424c28194 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3037,10 +3037,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) unsigned char blocksize_bits = min_t(unsigned char, sb->s_blocksize_bits, EXT4_MAX_BLOCK_LOG_SIZE); - struct sg { - struct ext4_group_info info; - ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; - } sg; + DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, + EXT4_MAX_BLOCK_LOG_SIZE + 2); group--; if (group == 0) @@ -3048,7 +3046,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); - i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); @@ -3068,14 +3066,14 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) * We care only about free space counters in the group info and * these are safe to access even after the buddy has been unloaded */ - memcpy(&sg, grinfo, i); - seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, - sg.info.bb_fragments, sg.info.bb_first_free); + memcpy(sg, grinfo, i); + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, + sg->bb_fragments, sg->bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? - sg.info.bb_counters[i] : 0); + sg->bb_counters[i] : 0); seq_puts(seq, " ]"); - if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info)) + if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg)) seq_puts(seq, " Block bitmap corrupted!"); seq_putc(seq, '\n'); return 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index cb5cb33b1d91..e9712e64ec8f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1971,7 +1971,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, * split it in half by count; each resulting block will have at least * half the space free. */ - if (i > 0) + if (i >= 0) split = count - move; else split = count/2; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 2c7b24cb67ad..53c2626e90e7 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1669,6 +1669,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc) unsigned int virtqueue_size; int err = -EIO; + if (!fsc->source) + return invalf(fsc, "No source specified"); + /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() * to drop the reference to this object. diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index be7f87a8e11a..7bd231d16d4a 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -4,7 +4,6 @@ config GFS2_FS select BUFFER_HEAD select FS_POSIX_ACL select CRC32 - select LIBCRC32C select QUOTACTL select FS_IOMAP help diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 6add6ebfef89..cb823a8a6ba9 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) else key_len = tree->max_key_len + 1; + if (key_len > sizeof(hfs_btree_key) || key_len < 1) { + memset(key, 0, sizeof(hfs_btree_key)); + pr_err("hfs: Invalid key length: %d\n", key_len); + return; + } + hfs_bnode_read(node, key, off, key_len); } diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 87974d5e6791..079ea80534f7 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) else key_len = tree->max_key_len + 2; + if (key_len > sizeof(hfsplus_btree_key) || key_len < 1) { + memset(key, 0, sizeof(hfsplus_btree_key)); + pr_err("hfsplus: Invalid key length: %d\n", key_len); + return; + } + hfs_bnode_read(node, key, off, key_len); } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 31553372b33a..5b08bd417b28 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -259,7 +259,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, } /* truncate len if we find any trailing uptodate block(s) */ - for ( ; i <= last; i++) { + while (++i <= last) { if (ifs_block_is_uptodate(ifs, i)) { plen -= (last - i + 1) * block_size; last = i - 1; diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 35768a63fb1d..421d247fae52 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -180,7 +180,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb, return NULL; return isofs_export_iget(sb, - fh_len > 2 ? ifid->parent_block : 0, + fh_len > 3 ? ifid->parent_block : 0, ifid->parent_offset, fh_len > 4 ? ifid->parent_generation : 0); } diff --git a/fs/namei.c b/fs/namei.c index 360a86ca1f02..84a0e0b0111c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -125,9 +125,9 @@ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) -static inline void initname(struct filename *name) +static inline void initname(struct filename *name, const char __user *uptr) { - name->uptr = NULL; + name->uptr = uptr; name->aname = NULL; atomic_set(&name->refcnt, 1); } @@ -210,7 +210,7 @@ getname_flags(const char __user *filename, int flags) return ERR_PTR(-ENAMETOOLONG); } } - initname(result); + initname(result, filename); audit_getname(result); return result; } @@ -268,7 +268,7 @@ struct filename *getname_kernel(const char * filename) return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); - initname(result); + initname(result, NULL); audit_getname(result); return result; } @@ -1665,27 +1665,20 @@ static struct dentry *lookup_dcache(const struct qstr *name, return dentry; } -/* - * Parent directory has inode locked exclusive. This is one - * and only case when ->lookup() gets called on non in-lookup - * dentries - as the matter of fact, this only gets called - * when directory is guaranteed to have no in-lookup children - * at all. - * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. - * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. - */ -struct dentry *lookup_one_qstr_excl(const struct qstr *name, - struct dentry *base, - unsigned int flags) +static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name, + struct dentry *base, + unsigned int flags) { - struct dentry *dentry = lookup_dcache(name, base, flags); + struct dentry *dentry; struct dentry *old; - struct inode *dir = base->d_inode; + struct inode *dir; + dentry = lookup_dcache(name, base, flags); if (dentry) - goto found; + return dentry; /* Don't create child dentry for a dead directory. */ + dir = base->d_inode; if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); @@ -1698,7 +1691,24 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, dput(dentry); dentry = old; } -found: + return dentry; +} + +/* + * Parent directory has inode locked exclusive. This is one + * and only case when ->lookup() gets called on non in-lookup + * dentries - as the matter of fact, this only gets called + * when directory is guaranteed to have no in-lookup children + * at all. + * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. + * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. + */ +struct dentry *lookup_one_qstr_excl(const struct qstr *name, + struct dentry *base, unsigned int flags) +{ + struct dentry *dentry; + + dentry = lookup_one_qstr_excl_raw(name, base, flags); if (IS_ERR(dentry)) return dentry; if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { @@ -2742,23 +2752,48 @@ static int filename_parentat(int dfd, struct filename *name, /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) { + struct path parent_path __free(path_put) = {}; struct dentry *d; struct qstr last; int type, error; - error = filename_parentat(dfd, name, 0, path, &last, &type); + error = filename_parentat(dfd, name, 0, &parent_path, &last, &type); if (error) return ERR_PTR(error); - if (unlikely(type != LAST_NORM)) { - path_put(path); + if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); + inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); + if (IS_ERR(d)) { + inode_unlock(parent_path.dentry->d_inode); + return d; } - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, path->dentry, 0); + path->dentry = no_free_ptr(parent_path.dentry); + path->mnt = no_free_ptr(parent_path.mnt); + return d; +} + +struct dentry *kern_path_locked_negative(const char *name, struct path *path) +{ + struct path parent_path __free(path_put) = {}; + struct filename *filename __free(putname) = getname_kernel(name); + struct dentry *d; + struct qstr last; + int type, error; + + error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type); + if (error) + return ERR_PTR(error); + if (unlikely(type != LAST_NORM)) + return ERR_PTR(-EINVAL); + inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl_raw(&last, parent_path.dentry, 0); if (IS_ERR(d)) { - inode_unlock(path->dentry->d_inode); - path_put(path); + inode_unlock(parent_path.dentry->d_inode); + return d; } + path->dentry = no_free_ptr(parent_path.dentry); + path->mnt = no_free_ptr(parent_path.mnt); return d; } diff --git a/fs/namespace.c b/fs/namespace.c index 14935a0500a2..d9ca80dcc544 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1830,6 +1830,8 @@ static inline void namespace_lock(void) down_write(&namespace_sem); } +DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock()) + enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, @@ -2383,7 +2385,7 @@ void dissolve_on_fput(struct vfsmount *mnt) return; } - scoped_guard(rwsem_write, &namespace_sem) { + scoped_guard(namespace_lock, &namespace_sem) { ns = m->mnt_ns; if (!must_dissolve(ns)) return; @@ -5189,8 +5191,8 @@ static void finish_mount_kattr(struct mount_kattr *kattr) mnt_idmap_put(kattr->mnt_idmap); } -static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize, - struct mount_kattr *kattr) +static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize, + struct mount_kattr *kattr) { int ret; struct mount_attr attr; @@ -5213,9 +5215,13 @@ static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize, if (attr.attr_set == 0 && attr.attr_clr == 0 && attr.propagation == 0) - return 0; + return 0; /* Tell caller to not bother. */ + + ret = build_mount_kattr(&attr, usize, kattr); + if (ret < 0) + return ret; - return build_mount_kattr(&attr, usize, kattr); + return 1; } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, @@ -5247,8 +5253,8 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; - err = copy_mount_setattr(uattr, usize, &kattr); - if (err) + err = wants_mount_setattr(uattr, usize, &kattr); + if (err <= 0) return err; err = user_path_at(dfd, path, kattr.lookup_flags, &target); @@ -5282,15 +5288,17 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, if (flags & AT_RECURSIVE) kattr.kflags |= MOUNT_KATTR_RECURSE; - ret = copy_mount_setattr(uattr, usize, &kattr); - if (ret) + ret = wants_mount_setattr(uattr, usize, &kattr); + if (ret < 0) return ret; - ret = do_mount_setattr(&file->f_path, &kattr); - if (ret) - return ret; + if (ret) { + ret = do_mount_setattr(&file->f_path, &kattr); + if (ret) + return ret; - finish_mount_kattr(&kattr); + finish_mount_kattr(&kattr); + } } fd = get_unused_fd_flags(flags & O_CLOEXEC); diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c index 9397ed39b0b4..8f70f8da064b 100644 --- a/fs/netfs/fscache_cache.c +++ b/fs/netfs/fscache_cache.c @@ -372,7 +372,7 @@ void fscache_withdraw_cache(struct fscache_cache *cache) EXPORT_SYMBOL(fscache_withdraw_cache); #ifdef CONFIG_PROC_FS -static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] = "-PAEW"; +static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] __nonstring = "-PAEW"; /* * Generate a list of caches in /proc/fs/fscache/caches diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index d4d4b3a8b106..3d56fc73435f 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ -29,7 +29,7 @@ static LIST_HEAD(fscache_cookie_lru); static DEFINE_SPINLOCK(fscache_cookie_lru_lock); DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out); static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker); -static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD"; +static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] __nonstring = "-LCAIFUWRD"; static unsigned int fscache_lru_cookie_timeout = 10 * HZ; void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 4e3e62040831..70ecc8f5f210 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -127,11 +127,13 @@ static int __init netfs_init(void) if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0) goto error_subreqpool; +#ifdef CONFIG_PROC_FS if (!proc_mkdir("fs/netfs", NULL)) goto error_proc; if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL, &netfs_requests_seq_ops)) goto error_procfile; +#endif #ifdef CONFIG_FSCACHE_STATS if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL, netfs_stats_show)) @@ -144,9 +146,11 @@ static int __init netfs_init(void) return 0; error_fscache: +#ifdef CONFIG_PROC_FS error_procfile: remove_proc_subtree("fs/netfs", NULL); error_proc: +#endif mempool_exit(&netfs_subrequest_pool); error_subreqpool: kmem_cache_destroy(netfs_subrequest_slab); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index d3f76101ad4b..07932ce9246c 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -2,6 +2,7 @@ config NFS_FS tristate "NFS client support" depends on INET && FILE_LOCKING && MULTIUSER + select CRC32 select LOCKD select SUNRPC select NFS_COMMON @@ -196,7 +197,6 @@ config NFS_USE_KERNEL_DNS config NFS_DEBUG bool depends on NFS_FS && SUNRPC_DEBUG - select CRC32 default y config NFS_DISABLE_UDP_SUPPORT diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ec8d32d0e2e9..6655e5f32ec6 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -899,18 +899,11 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } -#ifdef CONFIG_CRC32 static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) { return ~crc32_le(0xFFFFFFFF, &stateid->other[0], NFS4_STATEID_OTHER_SIZE); } -#else -static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) -{ - return 0; -} -#endif static inline bool nfs_current_task_exiting(void) { diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 351616c61df5..f9c291e2165c 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -148,16 +148,12 @@ static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst, memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN); } -#ifdef CONFIG_CRC32 /* * nfs_session_id_hash - calculate the crc32 hash for the session id * @session - pointer to session */ #define nfs_session_id_hash(sess_id) \ (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data))) -#else -#define nfs_session_id_hash(session) (0) -#endif #else /* defined(CONFIG_NFS_V4_1) */ static inline int nfs4_init_session(struct nfs_client *clp) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 792d3fed1b45..731a88f6313e 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -4,6 +4,7 @@ config NFSD depends on INET depends on FILE_LOCKING depends on FSNOTIFY + select CRC32 select LOCKD select SUNRPC select EXPORTFS diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2041268b398a..59a693f22452 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5430,7 +5430,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) queued = nfsd4_run_cb(&dp->dl_recall); WARN_ON_ONCE(!queued); if (!queued) - nfs4_put_stid(&dp->dl_stid); + refcount_dec(&dp->dl_stid.sc_count); } /* Called from break_lease() with flc_lock held. */ diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 876152a91f12..5103c2f4d225 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -267,7 +267,6 @@ static inline bool fh_fsid_match(const struct knfsd_fh *fh1, return true; } -#ifdef CONFIG_CRC32 /** * knfsd_fh_hash - calculate the crc32 hash for the filehandle * @fh - pointer to filehandle @@ -279,12 +278,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) { return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size); } -#else -static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) -{ - return 0; -} -#endif /** * fh_clear_pre_post_attrs - Reset pre/post attributes diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6f2f8f4cfbbc..aef942a758ce 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -541,8 +541,6 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); int ovl_ensure_verity_loaded(struct path *path); -int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path, - u8 *digest_buf, int *buf_length); int ovl_validate_verity(struct ovl_fs *ofs, struct path *metapath, struct path *datapath); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b63474d1b064..e19940d649ca 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1138,6 +1138,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, return ERR_PTR(-EINVAL); } + if (ctx->nr == ctx->nr_data) { + pr_err("at least one non-data lowerdir is required\n"); + return ERR_PTR(-EINVAL); + } + err = -EINVAL; for (i = 0; i < ctx->nr; i++) { l = &ctx->lower[i]; diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index e69968e88fe7..35892df7335c 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -704,18 +704,12 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server) cifs_free_hash(&server->secmech.md5); cifs_free_hash(&server->secmech.sha512); - if (!SERVER_IS_CHAN(server)) { - if (server->secmech.enc) { - crypto_free_aead(server->secmech.enc); - server->secmech.enc = NULL; - } - - if (server->secmech.dec) { - crypto_free_aead(server->secmech.dec); - server->secmech.dec = NULL; - } - } else { + if (server->secmech.enc) { + crypto_free_aead(server->secmech.enc); server->secmech.enc = NULL; + } + if (server->secmech.dec) { + crypto_free_aead(server->secmech.dec); server->secmech.dec = NULL; } } diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 07c4688ec4c9..3b32116b0b49 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -625,10 +625,8 @@ struct smb_version_operations { bool (*is_status_io_timeout)(char *buf); /* Check for STATUS_NETWORK_NAME_DELETED */ bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); - int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb, - const char *full_path, - struct kvec *rsp_iov, - struct cifs_open_info_data *data); + struct reparse_data_buffer * (*get_reparse_point_buffer)(const struct kvec *rsp_iov, + u32 *plen); int (*create_reparse_symlink)(const unsigned int xid, struct inode *inode, struct dentry *dentry, diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 48d0d6f439cf..18d67ab113f0 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -2256,6 +2256,8 @@ typedef struct { #define FILE_SUPPORTS_ENCRYPTION 0x00020000 #define FILE_SUPPORTS_OBJECT_IDS 0x00010000 #define FILE_VOLUME_IS_COMPRESSED 0x00008000 +#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400 +#define FILE_RETURNS_CLEANUP_RESULT_INFO 0x00000200 #define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100 #define FILE_SUPPORTS_REPARSE_POINTS 0x00000080 #define FILE_SUPPORTS_SPARSE_FILES 0x00000040 diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index cfcc07905bdf..59f6fdfe560e 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -163,6 +163,8 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, struct cifsFileInfo **ret_file); +extern int cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, + struct file *file); extern unsigned int smbCalcSize(void *buf); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index f298e86a3c1f..df976ce6aed9 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -300,7 +300,6 @@ cifs_abort_connection(struct TCP_Server_Info *server) server->ssocket->flags); sock_release(server->ssocket); server->ssocket = NULL; - put_net(cifs_net_ns(server)); } server->sequence_number = 0; server->session_estab = false; @@ -1074,13 +1073,9 @@ clean_demultiplex_info(struct TCP_Server_Info *server) msleep(125); if (cifs_rdma_enabled(server)) smbd_destroy(server); - if (server->ssocket) { sock_release(server->ssocket); server->ssocket = NULL; - - /* Release netns reference for the socket. */ - put_net(cifs_net_ns(server)); } if (!list_empty(&server->pending_mid_q)) { @@ -1128,7 +1123,6 @@ clean_demultiplex_info(struct TCP_Server_Info *server) */ } - /* Release netns reference for this server. */ put_net(cifs_net_ns(server)); kfree(server->leaf_fullpath); kfree(server->hostname); @@ -1774,8 +1768,6 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; - - /* Grab netns reference for this server. */ cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); tcp_ses->sign = ctx->sign; @@ -1903,7 +1895,6 @@ smbd_connected: out_err_crypto_release: cifs_crypto_secmech_release(tcp_ses); - /* Release netns reference for this server. */ put_net(cifs_net_ns(tcp_ses)); out_err: @@ -1912,10 +1903,8 @@ out_err: cifs_put_tcp_session(tcp_ses->primary_server, false); kfree(tcp_ses->hostname); kfree(tcp_ses->leaf_fullpath); - if (tcp_ses->ssocket) { + if (tcp_ses->ssocket) sock_release(tcp_ses->ssocket); - put_net(cifs_net_ns(tcp_ses)); - } kfree(tcp_ses); } return ERR_PTR(rc); @@ -2556,6 +2545,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) return 0; if (tcon->nodelete != ctx->nodelete) return 0; + if (tcon->posix_extensions != ctx->linux_ext) + return 0; return 1; } @@ -3357,24 +3348,20 @@ generic_ip_connect(struct TCP_Server_Info *server) socket = server->ssocket; } else { struct net *net = cifs_net_ns(server); + struct sock *sk; - rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket); + rc = __sock_create(net, sfamily, SOCK_STREAM, + IPPROTO_TCP, &server->ssocket, 1); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } - /* - * Grab netns reference for the socket. - * - * This reference will be released in several situations: - * - In the failure path before the cifsd thread is started. - * - In the all place where server->socket is released, it is - * also set to NULL. - * - Ultimately in clean_demultiplex_info(), during the final - * teardown. - */ - get_net(net); + sk = server->ssocket->sk; + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); /* BB other socket options to set KEEPALIVE, NODELAY? */ cifs_dbg(FYI, "Socket created\n"); @@ -3426,7 +3413,6 @@ generic_ip_connect(struct TCP_Server_Info *server) if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); - put_net(cifs_net_ns(server)); sock_release(socket); server->ssocket = NULL; return rc; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 8407fb108664..9e8f404b9e56 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -1007,6 +1007,11 @@ int cifs_open(struct inode *inode, struct file *file) } else { _cifsFileInfo_put(cfile, true, false); } + } else { + /* hard link on the defeered close file */ + rc = cifs_get_hardlink_path(tcon, inode, file); + if (rc) + cifs_close_deferred_file(CIFS_I(inode)); } if (server->oplocks) @@ -2071,6 +2076,29 @@ cifs_move_llist(struct list_head *source, struct list_head *dest) list_move(li, dest); } +int +cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, + struct file *file) +{ + struct cifsFileInfo *open_file = NULL; + struct cifsInodeInfo *cinode = CIFS_I(inode); + int rc = 0; + + spin_lock(&tcon->open_file_lock); + spin_lock(&cinode->open_file_lock); + + list_for_each_entry(open_file, &cinode->openFileList, flist) { + if (file->f_flags == open_file->f_flags) { + rc = -EINVAL; + break; + } + } + + spin_unlock(&cinode->open_file_lock); + spin_unlock(&tcon->open_file_lock); + return rc; +} + void cifs_free_llist(struct list_head *llist) { diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index a00a9d91d0da..75be4b46bc6f 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1203,18 +1203,17 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, goto out; } break; - case IO_REPARSE_TAG_MOUNT_POINT: - cifs_create_junction_fattr(fattr, sb); - rc = 0; - goto out; default: /* Check for cached reparse point data */ if (data->symlink_target || data->reparse.buf) { rc = 0; - } else if (iov && server->ops->parse_reparse_point) { - rc = server->ops->parse_reparse_point(cifs_sb, - full_path, - iov, data); + } else if (iov && server->ops->get_reparse_point_buffer) { + struct reparse_data_buffer *reparse_buf; + u32 reparse_len; + + reparse_buf = server->ops->get_reparse_point_buffer(iov, &reparse_len); + rc = parse_reparse_point(reparse_buf, reparse_len, + cifs_sb, full_path, data); /* * If the reparse point was not handled but it is the * name surrogate which points to directory, then treat @@ -1228,6 +1227,16 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, cifs_create_junction_fattr(fattr, sb); goto out; } + /* + * If the reparse point is unsupported by the Linux SMB + * client then let it process by the SMB server. So mask + * the -EOPNOTSUPP error code. This will allow Linux SMB + * client to send SMB OPEN request to server. If server + * does not support this reparse point too then server + * will return error during open the path. + */ + if (rc == -EOPNOTSUPP) + rc = 0; } if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) { diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 2b9e9885dc42..bb25e77c5540 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -542,12 +542,12 @@ static int wsl_set_reparse_buf(struct reparse_data_buffer **buf, kfree(symname_utf16); return -ENOMEM; } - /* Flag 0x02000000 is unknown, but all wsl symlinks have this value */ - symlink_buf->Flags = cpu_to_le32(0x02000000); - /* PathBuffer is in UTF-8 but without trailing null-term byte */ + /* Version field must be set to 2 (MS-FSCC 2.1.2.7) */ + symlink_buf->Version = cpu_to_le32(2); + /* Target for Version 2 is in UTF-8 but without trailing null-term byte */ symname_utf8_len = utf16s_to_utf8s((wchar_t *)symname_utf16, symname_utf16_len/2, UTF16_LITTLE_ENDIAN, - symlink_buf->PathBuffer, + symlink_buf->Target, symname_utf8_maxlen); *buf = (struct reparse_data_buffer *)symlink_buf; buf_len = sizeof(struct reparse_wsl_symlink_data_buffer) + symname_utf8_len; @@ -1016,29 +1016,36 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf struct cifs_open_info_data *data) { int len = le16_to_cpu(buf->ReparseDataLength); + int data_offset = offsetof(typeof(*buf), Target) - offsetof(typeof(*buf), Version); int symname_utf8_len; __le16 *symname_utf16; int symname_utf16_len; - if (len <= sizeof(buf->Flags)) { + if (len <= data_offset) { cifs_dbg(VFS, "srv returned malformed wsl symlink buffer\n"); return -EIO; } - /* PathBuffer is in UTF-8 but without trailing null-term byte */ - symname_utf8_len = len - sizeof(buf->Flags); + /* MS-FSCC 2.1.2.7 defines layout of the Target field only for Version 2. */ + if (le32_to_cpu(buf->Version) != 2) { + cifs_dbg(VFS, "srv returned unsupported wsl symlink version %u\n", le32_to_cpu(buf->Version)); + return -EIO; + } + + /* Target for Version 2 is in UTF-8 but without trailing null-term byte */ + symname_utf8_len = len - data_offset; /* * Check that buffer does not contain null byte * because Linux cannot process symlink with null byte. */ - if (strnlen(buf->PathBuffer, symname_utf8_len) != symname_utf8_len) { + if (strnlen(buf->Target, symname_utf8_len) != symname_utf8_len) { cifs_dbg(VFS, "srv returned null byte in wsl symlink target location\n"); return -EIO; } symname_utf16 = kzalloc(symname_utf8_len * 2, GFP_KERNEL); if (!symname_utf16) return -ENOMEM; - symname_utf16_len = utf8s_to_utf16s(buf->PathBuffer, symname_utf8_len, + symname_utf16_len = utf8s_to_utf16s(buf->Target, symname_utf8_len, UTF16_LITTLE_ENDIAN, (wchar_t *) symname_utf16, symname_utf8_len * 2); if (symname_utf16_len < 0) { @@ -1062,8 +1069,6 @@ int parse_reparse_point(struct reparse_data_buffer *buf, const char *full_path, struct cifs_open_info_data *data) { - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - data->reparse.buf = buf; /* See MS-FSCC 2.1.2 */ @@ -1090,24 +1095,17 @@ int parse_reparse_point(struct reparse_data_buffer *buf, } return 0; default: - cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", - le32_to_cpu(buf->ReparseTag)); return -EOPNOTSUPP; } } -int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, - const char *full_path, - struct kvec *rsp_iov, - struct cifs_open_info_data *data) +struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, + u32 *plen) { - struct reparse_data_buffer *buf; struct smb2_ioctl_rsp *io = rsp_iov->iov_base; - u32 plen = le32_to_cpu(io->OutputCount); - - buf = (struct reparse_data_buffer *)((u8 *)io + - le32_to_cpu(io->OutputOffset)); - return parse_reparse_point(buf, plen, cifs_sb, full_path, data); + *plen = le32_to_cpu(io->OutputCount); + return (struct reparse_data_buffer *)((u8 *)io + + le32_to_cpu(io->OutputOffset)); } static bool wsl_to_fattr(struct cifs_open_info_data *data, @@ -1233,16 +1231,6 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, bool ok; switch (tag) { - case IO_REPARSE_TAG_INTERNAL: - if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY)) - return false; - fallthrough; - case IO_REPARSE_TAG_DFS: - case IO_REPARSE_TAG_DFSR: - case IO_REPARSE_TAG_MOUNT_POINT: - /* See cifs_create_junction_fattr() */ - fattr->cf_mode = S_IFDIR | 0711; - break; case IO_REPARSE_TAG_LX_SYMLINK: case IO_REPARSE_TAG_LX_FIFO: case IO_REPARSE_TAG_AF_UNIX: @@ -1262,7 +1250,14 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, fattr->cf_mode |= S_IFLNK; break; default: - return false; + if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY)) + return false; + if (!IS_REPARSE_TAG_NAME_SURROGATE(tag) && + tag != IO_REPARSE_TAG_INTERNAL) + return false; + /* See cifs_create_junction_fattr() */ + fattr->cf_mode = S_IFDIR | 0711; + break; } fattr->cf_dtype = S_DT(fattr->cf_mode); diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index c0be5ab45a78..08de853b36a8 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -135,9 +135,6 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, int smb2_mknod_reparse(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); -int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, - const char *full_path, - struct kvec *rsp_iov, - struct cifs_open_info_data *data); +struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, u32 *len); #endif /* _CIFS_REPARSE_H */ diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index f2ca5963cd9d..b3fa9ee26912 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -680,6 +680,22 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) *pbcc_area = bcc_ptr; } +static void +ascii_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + + strcpy(bcc_ptr, "Linux version "); + bcc_ptr += strlen("Linux version "); + strcpy(bcc_ptr, init_utsname()->release); + bcc_ptr += strlen(init_utsname()->release) + 1; + + strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); + bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + + *pbcc_area = bcc_ptr; +} + static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, const struct nls_table *nls_cp) { @@ -704,6 +720,25 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, *pbcc_area = bcc_ptr; } +static void ascii_domain_string(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int len; + + /* copy domain */ + if (ses->domainName != NULL) { + len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + if (WARN_ON_ONCE(len < 0)) + len = CIFS_MAX_DOMAINNAME_LEN - 1; + bcc_ptr += len; + } /* else we send a null domain name so server will default to its own domain */ + *bcc_ptr = 0; + bcc_ptr++; + + *pbcc_area = bcc_ptr; +} + static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, const struct nls_table *nls_cp) { @@ -749,25 +784,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, *bcc_ptr = 0; bcc_ptr++; /* account for null termination */ - /* copy domain */ - if (ses->domainName != NULL) { - len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); - if (WARN_ON_ONCE(len < 0)) - len = CIFS_MAX_DOMAINNAME_LEN - 1; - bcc_ptr += len; - } /* else we send a null domain name so server will default to its own domain */ - *bcc_ptr = 0; - bcc_ptr++; - /* BB check for overflow here */ - strcpy(bcc_ptr, "Linux version "); - bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, init_utsname()->release); - bcc_ptr += strlen(init_utsname()->release) + 1; - - strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); - bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + ascii_domain_string(&bcc_ptr, ses, nls_cp); + ascii_oslm_strings(&bcc_ptr, nls_cp); *pbcc_area = bcc_ptr; } @@ -1570,7 +1590,7 @@ sess_auth_kerberos(struct sess_data *sess_data) sess_data->iov[1].iov_len = msg->secblob_len; pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len); - if (ses->capabilities & CAP_UNICODE) { + if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) { /* unicode strings must be word aligned */ if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; @@ -1579,8 +1599,8 @@ sess_auth_kerberos(struct sess_data *sess_data) unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp); } else { - /* BB: is this right? */ - ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp); + ascii_domain_string(&bcc_ptr, ses, sess_data->nls_cp); } sess_data->iov[2].iov_len = (long) bcc_ptr - diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 26df807fbe7a..0adeec652dc1 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -568,6 +568,42 @@ static int cifs_query_path_info(const unsigned int xid, data->reparse_point = le32_to_cpu(fi.Attributes) & ATTR_REPARSE; } +#ifdef CONFIG_CIFS_XATTR + /* + * For WSL CHR and BLK reparse points it is required to fetch + * EA $LXDEV which contains major and minor device numbers. + */ + if (!rc && data->reparse_point) { + struct smb2_file_full_ea_info *ea; + + ea = (struct smb2_file_full_ea_info *)data->wsl.eas; + rc = CIFSSMBQAllEAs(xid, tcon, full_path, SMB2_WSL_XATTR_DEV, + &ea->ea_data[SMB2_WSL_XATTR_NAME_LEN + 1], + SMB2_WSL_XATTR_DEV_SIZE, cifs_sb); + if (rc == SMB2_WSL_XATTR_DEV_SIZE) { + ea->next_entry_offset = cpu_to_le32(0); + ea->flags = 0; + ea->ea_name_length = SMB2_WSL_XATTR_NAME_LEN; + ea->ea_value_length = cpu_to_le16(SMB2_WSL_XATTR_DEV_SIZE); + memcpy(&ea->ea_data[0], SMB2_WSL_XATTR_DEV, SMB2_WSL_XATTR_NAME_LEN + 1); + data->wsl.eas_len = sizeof(*ea) + SMB2_WSL_XATTR_NAME_LEN + 1 + + SMB2_WSL_XATTR_DEV_SIZE; + rc = 0; + } else if (rc >= 0) { + /* It is an error if EA $LXDEV has wrong size. */ + rc = -EINVAL; + } else { + /* + * In all other cases ignore error if fetching + * of EA $LXDEV failed. It is needed only for + * WSL CHR and BLK reparse points and wsl_to_fattr() + * handle the case when EA is missing. + */ + rc = 0; + } + } +#endif + return rc; } @@ -970,18 +1006,13 @@ static int cifs_query_symlink(const unsigned int xid, return rc; } -static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb, - const char *full_path, - struct kvec *rsp_iov, - struct cifs_open_info_data *data) +static struct reparse_data_buffer *cifs_get_reparse_point_buffer(const struct kvec *rsp_iov, + u32 *plen) { - struct reparse_data_buffer *buf; TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base; - u32 plen = le16_to_cpu(io->ByteCount); - - buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + - le32_to_cpu(io->DataOffset)); - return parse_reparse_point(buf, plen, cifs_sb, full_path, data); + *plen = le16_to_cpu(io->ByteCount); + return (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + + le32_to_cpu(io->DataOffset)); } static bool @@ -1157,7 +1188,7 @@ struct smb_version_operations smb1_operations = { .rename = CIFSSMBRename, .create_hardlink = CIFSCreateHardLink, .query_symlink = cifs_query_symlink, - .parse_reparse_point = cifs_parse_reparse_point, + .get_reparse_point_buffer = cifs_get_reparse_point_buffer, .open = cifs_open_file, .set_fid = cifs_set_fid, .close = cifs_close_file, diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 41d8cd20b25f..2fe8eeb98535 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4555,9 +4555,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, return rc; } } else { - if (unlikely(!server->secmech.dec)) - return -EIO; - + rc = smb3_crypto_aead_allocate(server); + if (unlikely(rc)) + return rc; tfm = server->secmech.dec; } @@ -5303,7 +5303,7 @@ struct smb_version_operations smb20_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .parse_reparse_point = smb2_parse_reparse_point, + .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .create_reparse_symlink = smb2_create_reparse_symlink, @@ -5406,7 +5406,7 @@ struct smb_version_operations smb21_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .parse_reparse_point = smb2_parse_reparse_point, + .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .create_reparse_symlink = smb2_create_reparse_symlink, @@ -5513,7 +5513,7 @@ struct smb_version_operations smb30_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .parse_reparse_point = smb2_parse_reparse_point, + .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .create_reparse_symlink = smb2_create_reparse_symlink, @@ -5629,7 +5629,7 @@ struct smb_version_operations smb311_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .parse_reparse_point = smb2_parse_reparse_point, + .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .create_reparse_symlink = smb2_create_reparse_symlink, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 81e05db8e4d5..c4d52bebd37d 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1252,15 +1252,8 @@ SMB2_negotiate(const unsigned int xid, cifs_server_dbg(VFS, "Missing expected negotiate contexts\n"); } - if (server->cipher_type && !rc) { - if (!SERVER_IS_CHAN(server)) { - rc = smb3_crypto_aead_allocate(server); - } else { - /* For channels, just reuse the primary server crypto secmech. */ - server->secmech.enc = server->primary_server->secmech.enc; - server->secmech.dec = server->primary_server->secmech.dec; - } - } + if (server->cipher_type && !rc) + rc = smb3_crypto_aead_allocate(server); neg_exit: free_rsp_buf(resp_buftype, rsp); return rc; diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 764dca80c15c..f79a5165a7cc 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1567,13 +1567,13 @@ struct reparse_nfs_data_buffer { __u8 DataBuffer[]; } __packed; -/* For IO_REPARSE_TAG_LX_SYMLINK */ +/* For IO_REPARSE_TAG_LX_SYMLINK - see MS-FSCC 2.1.2.7 */ struct reparse_wsl_symlink_data_buffer { __le32 ReparseTag; __le16 ReparseDataLength; __u16 Reserved; - __le32 Flags; - __u8 PathBuffer[]; /* Variable Length UTF-8 string without nul-term */ + __le32 Version; /* Always 2 */ + __u8 Target[]; /* Variable Length UTF-8 string without nul-term */ } __packed; struct validate_negotiate_info_req { diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index c1f22c129111..83764c230e9d 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -39,8 +39,10 @@ void ksmbd_conn_free(struct ksmbd_conn *conn) xa_destroy(&conn->sessions); kvfree(conn->request_buf); kfree(conn->preauth_info); - if (atomic_dec_and_test(&conn->refcnt)) + if (atomic_dec_and_test(&conn->refcnt)) { + ksmbd_free_transport(conn->transport); kfree(conn); + } } /** diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index f103b1bd0400..81a29857b1e3 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -129,14 +129,6 @@ static void free_opinfo(struct oplock_info *opinfo) kfree(opinfo); } -static inline void opinfo_free_rcu(struct rcu_head *rcu_head) -{ - struct oplock_info *opinfo; - - opinfo = container_of(rcu_head, struct oplock_info, rcu_head); - free_opinfo(opinfo); -} - struct oplock_info *opinfo_get(struct ksmbd_file *fp) { struct oplock_info *opinfo; @@ -157,8 +149,8 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci) if (list_empty(&ci->m_op_list)) return NULL; - rcu_read_lock(); - opinfo = list_first_or_null_rcu(&ci->m_op_list, struct oplock_info, + down_read(&ci->m_lock); + opinfo = list_first_entry(&ci->m_op_list, struct oplock_info, op_entry); if (opinfo) { if (opinfo->conn == NULL || @@ -171,8 +163,7 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci) } } } - - rcu_read_unlock(); + up_read(&ci->m_lock); return opinfo; } @@ -185,7 +176,7 @@ void opinfo_put(struct oplock_info *opinfo) if (!atomic_dec_and_test(&opinfo->refcount)) return; - call_rcu(&opinfo->rcu_head, opinfo_free_rcu); + free_opinfo(opinfo); } static void opinfo_add(struct oplock_info *opinfo) @@ -193,7 +184,7 @@ static void opinfo_add(struct oplock_info *opinfo) struct ksmbd_inode *ci = opinfo->o_fp->f_ci; down_write(&ci->m_lock); - list_add_rcu(&opinfo->op_entry, &ci->m_op_list); + list_add(&opinfo->op_entry, &ci->m_op_list); up_write(&ci->m_lock); } @@ -207,7 +198,7 @@ static void opinfo_del(struct oplock_info *opinfo) write_unlock(&lease_list_lock); } down_write(&ci->m_lock); - list_del_rcu(&opinfo->op_entry); + list_del(&opinfo->op_entry); up_write(&ci->m_lock); } @@ -1347,8 +1338,8 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, ci = fp->f_ci; op = opinfo_get(fp); - rcu_read_lock(); - list_for_each_entry_rcu(brk_op, &ci->m_op_list, op_entry) { + down_read(&ci->m_lock); + list_for_each_entry(brk_op, &ci->m_op_list, op_entry) { if (brk_op->conn == NULL) continue; @@ -1358,7 +1349,6 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, if (ksmbd_conn_releasing(brk_op->conn)) continue; - rcu_read_unlock(); if (brk_op->is_lease && (brk_op->o_lease->state & (~(SMB2_LEASE_READ_CACHING_LE | SMB2_LEASE_HANDLE_CACHING_LE)))) { @@ -1388,9 +1378,8 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE, NULL); next: opinfo_put(brk_op); - rcu_read_lock(); } - rcu_read_unlock(); + up_read(&ci->m_lock); if (op) opinfo_put(op); diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h index 3f64f0787263..9a56eaadd0dd 100644 --- a/fs/smb/server/oplock.h +++ b/fs/smb/server/oplock.h @@ -71,7 +71,6 @@ struct oplock_info { struct list_head lease_entry; wait_queue_head_t oplock_q; /* Other server threads */ wait_queue_head_t oplock_brk; /* oplock breaking wait */ - struct rcu_head rcu_head; }; struct lease_break_info { diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index d24d95d15d87..57839f9708bb 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1602,8 +1602,10 @@ static int krb5_authenticate(struct ksmbd_work *work, if (prev_sess_id && prev_sess_id != sess->id) destroy_previous_session(conn, sess->user, prev_sess_id); - if (sess->state == SMB2_SESSION_VALID) + if (sess->state == SMB2_SESSION_VALID) { ksmbd_free_user(sess->user); + sess->user = NULL; + } retval = ksmbd_krb5_authenticate(sess, in_blob, in_len, out_blob, &out_len); diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index a3d8a905b07e..d742ba754348 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -72,6 +72,8 @@ #define FILE_SUPPORTS_ENCRYPTION 0x00020000 #define FILE_SUPPORTS_OBJECT_IDS 0x00010000 #define FILE_VOLUME_IS_COMPRESSED 0x00008000 +#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400 +#define FILE_RETURNS_CLEANUP_RESULT_INFO 0x00000200 #define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100 #define FILE_SUPPORTS_REPARSE_POINTS 0x00000080 #define FILE_SUPPORTS_SPARSE_FILES 0x00000040 diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 3f185ae60dc5..2a3e2b0ce557 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -310,7 +310,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) server_conf.signing = req->signing; server_conf.tcp_port = req->tcp_port; server_conf.ipc_timeout = req->ipc_timeout * HZ; - server_conf.deadtime = req->deadtime * SMB_ECHO_INTERVAL; + if (check_mul_overflow(req->deadtime, SMB_ECHO_INTERVAL, + &server_conf.deadtime)) { + ret = -EINVAL; + goto out; + } server_conf.share_fake_fscaps = req->share_fake_fscaps; ksmbd_init_domain(req->sub_auth); @@ -337,6 +341,7 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) server_conf.bind_interfaces_only = req->bind_interfaces_only; ret |= ksmbd_tcp_set_interfaces(KSMBD_STARTUP_CONFIG_INTERFACES(req), req->ifc_list_sz); +out: if (ret) { pr_err("Server configuration error: %s %s %s\n", req->netbios_name, req->server_string, diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 7f38a3c3f5bd..abedf510899a 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -93,17 +93,21 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk) return t; } -static void free_transport(struct tcp_transport *t) +void ksmbd_free_transport(struct ksmbd_transport *kt) { - kernel_sock_shutdown(t->sock, SHUT_RDWR); - sock_release(t->sock); - t->sock = NULL; + struct tcp_transport *t = TCP_TRANS(kt); - ksmbd_conn_free(KSMBD_TRANS(t)->conn); + sock_release(t->sock); kfree(t->iov); kfree(t); } +static void free_transport(struct tcp_transport *t) +{ + kernel_sock_shutdown(t->sock, SHUT_RDWR); + ksmbd_conn_free(KSMBD_TRANS(t)->conn); +} + /** * kvec_array_init() - initialize a IO vector segment * @new: IO vector to be initialized diff --git a/fs/smb/server/transport_tcp.h b/fs/smb/server/transport_tcp.h index 8c9aa624cfe3..1e51675ee1b2 100644 --- a/fs/smb/server/transport_tcp.h +++ b/fs/smb/server/transport_tcp.h @@ -8,6 +8,7 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz); struct interface *ksmbd_find_netdev_name_iface_list(char *netdev_name); +void ksmbd_free_transport(struct ksmbd_transport *kt); int ksmbd_tcp_init(void); void ksmbd_tcp_destroy(void); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 8554aa5a1059..391d07da586c 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -479,7 +479,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, int err = 0; if (work->conn->connection_type) { - if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) { + if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE)) || + S_ISDIR(file_inode(fp->filp)->i_mode)) { pr_err("no right to write(%pD)\n", fp->filp); err = -EACCES; goto out; diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 8d1f30dcba7e..1f8fa3468173 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -713,12 +713,8 @@ static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon, static bool ksmbd_durable_scavenger_alive(void) { - mutex_lock(&durable_scavenger_lock); - if (!durable_scavenger_running) { - mutex_unlock(&durable_scavenger_lock); + if (!durable_scavenger_running) return false; - } - mutex_unlock(&durable_scavenger_lock); if (kthread_should_stop()) return false; @@ -799,9 +795,7 @@ static int ksmbd_durable_scavenger(void *dummy) break; } - mutex_lock(&durable_scavenger_lock); durable_scavenger_running = false; - mutex_unlock(&durable_scavenger_lock); module_put(THIS_MODULE); diff --git a/fs/stat.c b/fs/stat.c index f13308bfdc98..3d9222807214 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -204,12 +204,25 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, STATX_ATTR_DAX); idmap = mnt_idmap(path->mnt); - if (inode->i_op->getattr) - return inode->i_op->getattr(idmap, path, stat, - request_mask, - query_flags); + if (inode->i_op->getattr) { + int ret; + + ret = inode->i_op->getattr(idmap, path, stat, request_mask, + query_flags); + if (ret) + return ret; + } else { + generic_fillattr(idmap, request_mask, inode, stat); + } + + /* + * If this is a block device inode, override the filesystem attributes + * with the block device specific parameters that need to be obtained + * from the bdev backing inode. + */ + if (S_ISBLK(stat->mode)) + bdev_statx(path, stat, request_mask); - generic_fillattr(idmap, request_mask, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); @@ -295,15 +308,6 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, if (path_mounted(path)) stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; - - /* - * If this is a block device inode, override the filesystem - * attributes with the block device specific parameters that need to be - * obtained from the bdev backing inode. - */ - if (S_ISBLK(stat->mode)) - bdev_statx(path, stat, request_mask); - return 0; } diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index fffd6fffdce0..ae0ca6858496 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -3,7 +3,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK select EXPORTFS - select LIBCRC32C + select CRC32 select FS_IOMAP help XFS is a high performance journaling filesystem which originated diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8e7f1b324b3b..1a2b3f06fa71 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -105,6 +105,7 @@ xfs_buf_free( { unsigned int size = BBTOB(bp->b_length); + might_sleep(); trace_xfs_buf_free(bp, _RET_IP_); ASSERT(list_empty(&bp->b_lru)); diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index b4ffd80b7cb6..dcbfa274e06d 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -165,7 +165,7 @@ xmbuf_map_backing_mem( folio_set_dirty(folio); folio_unlock(folio); - bp->b_addr = folio_address(folio); + bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos); return 0; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index edbc521870a1..b4e32f0860b7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1186,9 +1186,8 @@ xfs_qm_dqflush_done( if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && (lip->li_lsn == qlip->qli_flush_lsn || test_bit(XFS_LI_FAILED, &lip->li_flags))) { - spin_lock(&ailp->ail_lock); - xfs_clear_li_failed(lip); + clear_bit(XFS_LI_FAILED, &lip->li_flags); if (lip->li_lsn == qlip->qli_flush_lsn) { /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index a4bc1642fe56..414b27a86458 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -876,6 +876,7 @@ xfs_getfsmap_rtdev_rmapbt( const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { + struct xfs_fsmap key0 = *keys; /* struct copy */ struct xfs_mount *mp = tp->t_mountp; struct xfs_rtgroup *rtg = NULL; struct xfs_btree_cur *bt_cur = NULL; @@ -887,32 +888,46 @@ xfs_getfsmap_rtdev_rmapbt( int error = 0; eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks); - if (keys[0].fmr_physical >= eofs) + if (key0.fmr_physical >= eofs) return 0; + /* + * On zoned filesystems with an internal rt volume, the volume comes + * immediately after the end of the data volume. However, the + * xfs_rtblock_t address space is relative to the start of the data + * device, which means that the first @rtstart fsblocks do not actually + * point anywhere. If a fsmap query comes in with the low key starting + * below @rtstart, report it as "owned by filesystem". + */ rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart); - if (keys[0].fmr_physical < rtstart_daddr) { + if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) { struct xfs_fsmap_irec frec = { .owner = XFS_RMAP_OWN_FS, .len_daddr = rtstart_daddr, }; - /* Adjust the low key if we are continuing from where we left off. */ - if (keys[0].fmr_length > 0) { - info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length; - return 0; + /* + * Adjust the start of the query range if we're picking up from + * a previous round, and only emit the record if we haven't + * already gone past. + */ + key0.fmr_physical += key0.fmr_length; + if (key0.fmr_physical < rtstart_daddr) { + error = xfs_getfsmap_helper(tp, info, &frec); + if (error) + return error; + + key0.fmr_physical = rtstart_daddr; } - /* Fabricate an rmap entry for space occupied by the data dev */ - error = xfs_getfsmap_helper(tp, info, &frec); - if (error) - return error; + /* Zero the other fields to avoid further adjustments. */ + key0.fmr_owner = 0; + key0.fmr_offset = 0; + key0.fmr_length = 0; } - start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical); - end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + - min(eofs - 1, keys[1].fmr_physical)); - + start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical); + end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); info->missing_owner = XFS_FMR_OWN_FREE; /* @@ -920,12 +935,12 @@ xfs_getfsmap_rtdev_rmapbt( * low to the fsmap low key and max out the high key to the end * of the rtgroup. */ - info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); - error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); + info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->low, &key0); if (error) return error; - info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); - xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length); + xfs_getfsmap_set_irec_flags(&info->low, &key0); /* Adjust the low key if we are continuing from where we left off. */ if (info->low.rm_blockcount == 0) { diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 40fc1bf900af..c6cb0b6b9e46 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -1089,13 +1089,7 @@ xfs_iflush_abort( * state. Whilst the inode is in the AIL, it should have a valid buffer * pointer for push operations to access - it is only safe to remove the * inode from the buffer once it has been removed from the AIL. - * - * We also clear the failed bit before removing the item from the AIL - * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer - * references the inode item owns and needs to hold until we've fully - * aborted the inode log item and detached it from the buffer. */ - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); xfs_trans_ail_delete(&iip->ili_item, 0); /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 6493bdb57351..980aabc49512 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2888,7 +2888,7 @@ xlog_force_and_check_iclog( * * 1. the current iclog is active and has no data; the previous iclog * is in the active or dirty state. - * 2. the current iclog is drity, and the previous iclog is in the + * 2. the current iclog is dirty, and the previous iclog is in the * active or dirty state. * * We may sleep if: diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 799b84220ebb..e5192c12e7ac 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -229,6 +229,7 @@ typedef struct xfs_mount { bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ unsigned int m_max_open_zones; + unsigned int m_zonegc_low_space; /* * Bitsets of per-fs metadata that have been checked and/or are sick. diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index b7e82d85f043..7a5c5ef2db92 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -718,8 +718,40 @@ max_open_zones_show( } XFS_SYSFS_ATTR_RO(max_open_zones); +static ssize_t +zonegc_low_space_store( + struct kobject *kobj, + const char *buf, + size_t count) +{ + int ret; + unsigned int val; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + if (val > 100) + return -EINVAL; + + zoned_to_mp(kobj)->m_zonegc_low_space = val; + + return count; +} + +static ssize_t +zonegc_low_space_show( + struct kobject *kobj, + char *buf) +{ + return sysfs_emit(buf, "%u\n", + zoned_to_mp(kobj)->m_zonegc_low_space); +} +XFS_SYSFS_ATTR_RW(zonegc_low_space); + static struct attribute *xfs_zoned_attrs[] = { ATTR_LIST(max_open_zones), + ATTR_LIST(zonegc_low_space), NULL, }; ATTRIBUTE_GROUPS(xfs_zoned); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 0fcb1828e598..85a649fec6ac 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -909,10 +909,9 @@ xfs_trans_ail_delete( return; } - /* xfs_ail_update_finish() drops the AIL lock */ - xfs_clear_li_failed(lip); + clear_bit(XFS_LI_FAILED, &lip->li_flags); tail_lsn = xfs_ail_delete_one(ailp, lip); - xfs_ail_update_finish(ailp, tail_lsn); + xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */ } int diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index bd841df93021..f945f0450b16 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -167,32 +167,4 @@ xfs_trans_ail_copy_lsn( } #endif -static inline void -xfs_clear_li_failed( - struct xfs_log_item *lip) -{ - struct xfs_buf *bp = lip->li_buf; - - ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); - lockdep_assert_held(&lip->li_ailp->ail_lock); - - if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) { - lip->li_buf = NULL; - xfs_buf_rele(bp); - } -} - -static inline void -xfs_set_li_failed( - struct xfs_log_item *lip, - struct xfs_buf *bp) -{ - lockdep_assert_held(&lip->li_ailp->ail_lock); - - if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) { - xfs_buf_hold(bp); - lip->li_buf = bp; - } -} - #endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 52af234936a2..d509e49b2aaa 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -1201,6 +1201,13 @@ xfs_mount_zones( xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, iz.available + iz.reclaimable); + /* + * The user may configure GC to free up a percentage of unused blocks. + * By default this is 0. GC will always trigger at the minimum level + * for keeping max_open_zones available for data placement. + */ + mp->m_zonegc_low_space = 0; + error = xfs_zone_gc_mount(mp); if (error) goto out_free_zone_info; diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index c5136ea9bb1d..8c541ca71872 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -162,18 +162,30 @@ struct xfs_zone_gc_data { /* * We aim to keep enough zones free in stock to fully use the open zone limit - * for data placement purposes. + * for data placement purposes. Additionally, the m_zonegc_low_space tunable + * can be set to make sure a fraction of the unused blocks are available for + * writing. */ bool xfs_zoned_need_gc( struct xfs_mount *mp) { + s64 available, free; + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) return false; - if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) < + + available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); + + if (available < mp->m_groups[XG_TYPE_RTG].blocks * (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) return true; + + free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); + if (available < mult_frac(free, mp->m_zonegc_low_space, 100)) + return true; + return false; } |