diff options
41 files changed, 1450 insertions, 1302 deletions
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 23d18b8a49d5..a7e6e14aeb08 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers Filesystem support consists of - adding support to mark inodes as being DAX by setting the S_DAX flag in i_flags -- implementing the direct_IO address space operation, and calling - dax_do_io() instead of blockdev_direct_IO() if S_DAX is set +- implementing ->read_iter and ->write_iter operations which use dax_iomap_rw() + when inode has S_DAX flag set - implementing an mmap file operation for DAX files which sets the VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to - include handlers for fault, pmd_fault and page_mkwrite (which should - probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the - appropriate get_block() callback) -- calling dax_truncate_page() instead of block_truncate_page() for DAX files -- calling dax_zero_page_range() instead of zero_user() for DAX files + include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These + handlers should probably call dax_iomap_fault() (for fault and page_mkwrite + handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate + iomap operations. +- calling iomap_zero_range() passing appropriate iomap operations instead of + block_truncate_page() for DAX files - ensuring that there is sufficient locking between reads, writes, truncates and page faults -The get_block() callback passed to the DAX functions may return -uninitialised extents. If it does, it must ensure that simultaneous -calls to get_block() (for example by a page-fault racing with a read() -or a write()) work correctly. +The iomap handlers for allocating blocks must make sure that allocated blocks +are zeroed out and converted to written extents before being returned to avoid +exposure of uninitialized data through mmap. These filesystems may be used for inspiration: - ext2: see Documentation/filesystems/ext2.txt diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 6c0108eb0137..3698ed3146e3 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -351,14 +351,13 @@ nouid32 Disables 32-bit UIDs and GIDs. This is for interoperability with older kernels which only store and expect 16-bit values. -block_validity This options allows to enables/disables the in-kernel +block_validity(*) These options enable or disable the in-kernel noblock_validity facility for tracking filesystem metadata blocks - within internal data structures. This allows multi- - block allocator and other routines to quickly locate - extents which might overlap with filesystem metadata - blocks. This option is intended for debugging - purposes and since it negatively affects the - performance, it is off by default. + within internal data structures. This allows multi- + block allocator and other routines to notice + bugs or corrupted allocation bitmaps which cause + blocks to be allocated which overlap with + filesystem metadata blocks. dioread_lock Controls whether or not ext4 should use the DIO read dioread_nolock locking. If the dioread_nolock option is specified diff --git a/MAINTAINERS b/MAINTAINERS index 1174508ee597..8007e2811264 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5240,6 +5240,7 @@ F: include/linux/fscache*.h FS-CRYPTO: FILE SYSTEM LEVEL ENCRYPTION SUPPORT M: Theodore Y. Ts'o <tytso@mit.edu> M: Jaegeuk Kim <jaegeuk@kernel.org> +L: linux-fsdevel@vger.kernel.org S: Supported F: fs/crypto/ F: include/linux/fscrypto.h diff --git a/fs/Kconfig b/fs/Kconfig index 4bd03a2b0518..8e9e5f4104f4 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -55,7 +55,6 @@ config FS_DAX_PMD depends on FS_DAX depends on ZONE_DEVICE depends on TRANSPARENT_HUGEPAGE - depends on BROKEN endif # BLOCK diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 92348faf9865..f514978f6688 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -8,9 +8,7 @@ config FS_ENCRYPTION select CRYPTO_XTS select CRYPTO_CTS select CRYPTO_CTR - select CRYPTO_SHA256 select KEYS - select ENCRYPTED_KEYS help Enable encryption of files and directories. This feature is similar to ecryptfs, but it is more memory diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 98f87fe8f186..ac8e4f6a3773 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -27,7 +27,7 @@ #include <linux/bio.h> #include <linux/dcache.h> #include <linux/namei.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx) { unsigned long flags; - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); ctx->w.bounce_page = NULL; } @@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx); * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) } else { ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; } - ctx->flags &= ~FS_WRITE_PATH_FL; + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx; } EXPORT_SYMBOL(fscrypt_get_ctx); @@ -146,9 +146,10 @@ typedef enum { FS_ENCRYPT, } fscrypt_direction_t; -static int do_page_crypto(struct inode *inode, - fscrypt_direction_t rw, pgoff_t index, +static int do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, struct page *dest_page, + unsigned int len, unsigned int offs, gfp_t gfp_flags) { struct { @@ -162,6 +163,8 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; + BUG_ON(len == 0); + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -175,14 +178,14 @@ static int do_page_crypto(struct inode *inode, page_crypt_complete, &ecr); BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(index); + xts_tweak.index = cpu_to_le64(lblk_num); memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -207,34 +210,66 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); - ctx->flags |= FS_WRITE_PATH_FL; + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx->w.bounce_page; } /** * fscypt_encrypt_page() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * @gfp_flags: The gfp flag for memory allocation + * @inode: The inode for which the encryption should take place + * @page: The page to encrypt. Must be locked for bounce-page + * encryption. + * @len: Length of data to encrypt in @page and encrypted + * data in returned page. + * @offs: Offset of data within @page and returned + * page holding encrypted data. + * @lblk_num: Logical block number. This must be unique for multiple + * calls with same inode, except when overwriting + * previously written data. + * @gfp_flags: The gfp flag for memory allocation * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. + * Encrypts @page using the ctx encryption context. Performs encryption + * either in-place or into a newly allocated bounce page. + * Called on the page write path. * - * Called on the page write path. The caller must call + * Bounce page allocation is the default. + * In this case, the contents of @page are encrypted and stored in an + * allocated bounce page. @page has to be locked and the caller must call * fscrypt_restore_control_page() on the returned ciphertext page to * release the bounce buffer and the encryption context. * - * Return: An allocated page with the encrypted content on success. Else, an + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in + * fscrypt_operations. Here, the input-page is returned with its content + * encrypted. + * + * Return: A page with the encrypted content on success. Else, an * error value or NULL. */ -struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page, gfp_t gfp_flags) +struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) + { struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; + struct page *ciphertext_page = page; int err; - BUG_ON(!PageLocked(plaintext_page)); + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); + + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { + /* with inplace-encryption we just encrypt the page */ + err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, + len, offs, gfp_flags); + if (err) + return ERR_PTR(err); + + return ciphertext_page; + } + + BUG_ON(!PageLocked(page)); ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) @@ -245,10 +280,10 @@ struct page *fscrypt_encrypt_page(struct inode *inode, if (IS_ERR(ciphertext_page)) goto errout; - ctx->w.control_page = plaintext_page; - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, - gfp_flags); + ctx->w.control_page = page; + err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, + len, offs, gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -265,8 +300,13 @@ errout: EXPORT_SYMBOL(fscrypt_encrypt_page); /** - * f2crypt_decrypt_page() - Decrypts a page in-place - * @page: The page to decrypt. Must be locked. + * fscrypt_decrypt_page() - Decrypts a page in-place + * @inode: The corresponding inode for the page to decrypt. + * @page: The page to decrypt. Must be locked in case + * it is a writeback page (FS_CFLG_OWN_PAGES unset). + * @len: Number of bytes in @page to be decrypted. + * @offs: Start of data in @page. + * @lblk_num: Logical block number. * * Decrypts page in-place using the ctx encryption context. * @@ -274,16 +314,18 @@ EXPORT_SYMBOL(fscrypt_encrypt_page); * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_decrypt_page(struct page *page) +int fscrypt_decrypt_page(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, u64 lblk_num) { - BUG_ON(!PageLocked(page)); + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) + BUG_ON(!PageLocked(page)); - return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page, GFP_NOFS); + return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len, + offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { struct fscrypt_ctx *ctx; @@ -306,7 +348,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, while (len--) { err = do_page_crypto(inode, FS_ENCRYPT, lblk, ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); + PAGE_SIZE, 0, GFP_NOFS); if (err) goto errout; @@ -414,7 +456,8 @@ static void completion_pages(struct work_struct *work) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page); + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); if (ret) { WARN_ON_ONCE(1); @@ -482,17 +525,22 @@ static void fscrypt_destroy(void) /** * fscrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: fscrypt operations flags * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_initialize(void) +int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) + /* + * No need to allocate a bounce page pool if there already is one or + * this FS won't use it. + */ + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) return 0; mutex_lock(&fscrypt_init_mutex); @@ -521,7 +569,6 @@ fail: mutex_unlock(&fscrypt_init_mutex); return res; } -EXPORT_SYMBOL(fscrypt_initialize); /** * fscrypt_init() - Set up for fs encryption. diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9b774f4b50c8..56ad9d195f18 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,7 +12,7 @@ #include <linux/scatterlist.h> #include <linux/ratelimit.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" /** * fname_crypt_complete() - completion callback for filename crypto @@ -209,7 +209,7 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) +u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) { int padding = 32; struct fscrypt_info *ci = inode->i_crypt_info; @@ -227,7 +227,7 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. */ -int fscrypt_fname_alloc_buffer(struct inode *inode, +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); @@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = get_crypt_info(dir); + ret = fscrypt_get_crypt_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h new file mode 100644 index 000000000000..aeab032d7d35 --- /dev/null +++ b/fs/crypto/fscrypt_private.h @@ -0,0 +1,93 @@ +/* + * fscrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#ifndef _FSCRYPT_PRIVATE_H +#define _FSCRYPT_PRIVATE_H + +#include <linux/fscrypto.h> + +#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/* Encryption parameters */ +#define FS_XTS_TWEAK_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 +#define FS_MAX_KEY_SIZE 64 + +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +/* This is passed in from userspace into the kernel keyring */ +struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; +} __packed; + +/* + * A pointer to this structure is stored in the file system's in-core + * representation of an inode. + */ +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct key *ci_keyring_key; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + + +/* crypto.c */ +int fscrypt_initialize(unsigned int cop_flags); + +/* keyinfo.c */ +extern int fscrypt_get_crypt_info(struct inode *); + +#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 67fb6d8876d0..6eeea1dcba41 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,7 +10,7 @@ #include <keys/user-type.h> #include <linux/scatterlist.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" static void derive_crypt_complete(struct crypto_async_request *req, int rc) { @@ -178,7 +178,7 @@ static void put_crypt_info(struct fscrypt_info *ci) kmem_cache_free(fscrypt_info_cachep, ci); } -int get_crypt_info(struct inode *inode) +int fscrypt_get_crypt_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -188,7 +188,7 @@ int get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; - res = fscrypt_initialize(); + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; @@ -327,7 +327,7 @@ int fscrypt_get_encryption_info(struct inode *inode) (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))))) - return get_crypt_info(inode); + return fscrypt_get_crypt_info(inode); return 0; } EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6865663aac69..6ed7c2eebeec 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,8 +10,8 @@ #include <linux/random.h> #include <linux/string.h> -#include <linux/fscrypto.h> #include <linux/mount.h> +#include "fscrypt_private.h" static int inode_has_encryption_context(struct inode *inode) { @@ -93,16 +93,19 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct file *filp, - const struct fscrypt_policy *policy) +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + if (copy_from_user(&policy, arg, sizeof(policy))) + return -EFAULT; + if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy->version != 0) + if (policy.version != 0) return -EINVAL; ret = mnt_want_write_file(filp); @@ -120,9 +123,9 @@ int fscrypt_process_policy(struct file *filp, ret = -ENOTEMPTY; else ret = create_encryption_context_from_policy(inode, - policy); + &policy); } else if (!is_encryption_context_consistent_with_policy(inode, - policy)) { + &policy)) { printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", __func__); @@ -134,11 +137,13 @@ int fscrypt_process_policy(struct file *filp, mnt_drop_write_file(filp); return ret; } -EXPORT_SYMBOL(fscrypt_process_policy); +EXPORT_SYMBOL(fscrypt_ioctl_set_policy); -int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { + struct inode *inode = file_inode(filp); struct fscrypt_context ctx; + struct fscrypt_policy policy; int res; if (!inode->i_sb->s_cop->get_context || @@ -151,15 +156,18 @@ int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + policy.version = 0; + policy.contents_encryption_mode = ctx.contents_encryption_mode; + policy.filenames_encryption_mode = ctx.filenames_encryption_mode; + policy.flags = ctx.flags; + memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); + + if (copy_to_user(arg, &policy, sizeof(policy))) + return -EFAULT; return 0; } -EXPORT_SYMBOL(fscrypt_get_policy); +EXPORT_SYMBOL(fscrypt_ioctl_get_policy); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { @@ -34,25 +34,11 @@ #include <linux/iomap.h> #include "internal.h" -/* - * We use lowest available bit in exceptional entry for locking, other two - * bits to determine entry type. In total 3 special bits. - */ -#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3) -#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) -#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK) -#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) -#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \ - RADIX_TREE_EXCEPTIONAL_ENTRY)) - /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) -wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; +static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) { @@ -64,14 +50,6 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); -static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, - pgoff_t index) -{ - unsigned long hash = hash_long((unsigned long)mapping ^ index, - DAX_WAIT_TABLE_BITS); - return wait_table + hash; -} - static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; @@ -98,209 +76,52 @@ static void dax_unmap_atomic(struct block_device *bdev, blk_queue_exit(bdev->bd_queue); } -struct page *read_dax_sector(struct block_device *bdev, sector_t n) +static int dax_is_pmd_entry(void *entry) { - struct page *page = alloc_pages(GFP_KERNEL, 0); - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), - }; - long rc; - - if (!page) - return ERR_PTR(-ENOMEM); - - rc = dax_map_atomic(bdev, &dax); - if (rc < 0) - return ERR_PTR(rc); - memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); - dax_unmap_atomic(bdev, &dax); - return page; + return (unsigned long)entry & RADIX_DAX_PMD; } -static bool buffer_written(struct buffer_head *bh) +static int dax_is_pte_entry(void *entry) { - return buffer_mapped(bh) && !buffer_unwritten(bh); + return !((unsigned long)entry & RADIX_DAX_PMD); } -/* - * When ext4 encounters a hole, it returns without modifying the buffer_head - * which means that we can't trust b_size. To cope with this, we set b_state - * to 0 before calling get_block and, if any bit is set, we know we can trust - * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is - * and would save us time calling get_block repeatedly. - */ -static bool buffer_size_valid(struct buffer_head *bh) +static int dax_is_zero_entry(void *entry) { - return bh->b_state != 0; + return (unsigned long)entry & RADIX_DAX_HZP; } - -static sector_t to_sector(const struct buffer_head *bh, - const struct inode *inode) +static int dax_is_empty_entry(void *entry) { - sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); - - return sector; + return (unsigned long)entry & RADIX_DAX_EMPTY; } -static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, - loff_t start, loff_t end, get_block_t get_block, - struct buffer_head *bh) +struct page *read_dax_sector(struct block_device *bdev, sector_t n) { - loff_t pos = start, max = start, bh_max = start; - bool hole = false; - struct block_device *bdev = NULL; - int rw = iov_iter_rw(iter), rc; - long map_len = 0; + struct page *page = alloc_pages(GFP_KERNEL, 0); struct blk_dax_ctl dax = { - .addr = ERR_PTR(-EIO), + .size = PAGE_SIZE, + .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), }; - unsigned blkbits = inode->i_blkbits; - sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) - >> blkbits; - - if (rw == READ) - end = min(end, i_size_read(inode)); - - while (pos < end) { - size_t len; - if (pos == max) { - long page = pos >> PAGE_SHIFT; - sector_t block = page << (PAGE_SHIFT - blkbits); - unsigned first = pos - (block << blkbits); - long size; - - if (pos == bh_max) { - bh->b_size = PAGE_ALIGN(end - pos); - bh->b_state = 0; - rc = get_block(inode, block, bh, rw == WRITE); - if (rc) - break; - if (!buffer_size_valid(bh)) - bh->b_size = 1 << blkbits; - bh_max = pos - first + bh->b_size; - bdev = bh->b_bdev; - /* - * We allow uninitialized buffers for writes - * beyond EOF as those cannot race with faults - */ - WARN_ON_ONCE( - (buffer_new(bh) && block < file_blks) || - (rw == WRITE && buffer_unwritten(bh))); - } else { - unsigned done = bh->b_size - - (bh_max - (pos - first)); - bh->b_blocknr += done >> blkbits; - bh->b_size -= done; - } - - hole = rw == READ && !buffer_written(bh); - if (hole) { - size = bh->b_size - first; - } else { - dax_unmap_atomic(bdev, &dax); - dax.sector = to_sector(bh, inode); - dax.size = bh->b_size; - map_len = dax_map_atomic(bdev, &dax); - if (map_len < 0) { - rc = map_len; - break; - } - dax.addr += first; - size = map_len - first; - } - /* - * pos + size is one past the last offset for IO, - * so pos + size can overflow loff_t at extreme offsets. - * Cast to u64 to catch this and get the true minimum. - */ - max = min_t(u64, pos + size, end); - } - - if (iov_iter_rw(iter) == WRITE) { - len = copy_from_iter_pmem(dax.addr, max - pos, iter); - } else if (!hole) - len = copy_to_iter((void __force *) dax.addr, max - pos, - iter); - else - len = iov_iter_zero(max - pos, iter); - - if (!len) { - rc = -EFAULT; - break; - } + long rc; - pos += len; - if (!IS_ERR(dax.addr)) - dax.addr += len; - } + if (!page) + return ERR_PTR(-ENOMEM); + rc = dax_map_atomic(bdev, &dax); + if (rc < 0) + return ERR_PTR(rc); + memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); dax_unmap_atomic(bdev, &dax); - - return (pos == start) ? rc : pos - start; -} - -/** - * dax_do_io - Perform I/O to a DAX file - * @iocb: The control block for this I/O - * @inode: The file which the I/O is directed at - * @iter: The addresses to do I/O from or to - * @get_block: The filesystem method used to translate file offsets to blocks - * @end_io: A filesystem callback for I/O completion - * @flags: See below - * - * This function uses the same locking scheme as do_blockdev_direct_IO: - * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the - * caller for writes. For reads, we take and release the i_mutex ourselves. - * If DIO_LOCKING is not set, the filesystem takes care of its own locking. - * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O - * is in progress. - */ -ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, - struct iov_iter *iter, get_block_t get_block, - dio_iodone_t end_io, int flags) -{ - struct buffer_head bh; - ssize_t retval = -EINVAL; - loff_t pos = iocb->ki_pos; - loff_t end = pos + iov_iter_count(iter); - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - inode_lock(inode); - - /* Protects against truncate */ - if (!(flags & DIO_SKIP_DIO_COUNT)) - inode_dio_begin(inode); - - retval = dax_io(inode, iter, pos, end, get_block, &bh); - - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - inode_unlock(inode); - - if (end_io) { - int err; - - err = end_io(iocb, pos, retval, bh.b_private); - if (err) - retval = err; - } - - if (!(flags & DIO_SKIP_DIO_COUNT)) - inode_dio_end(inode); - return retval; + return page; } -EXPORT_SYMBOL_GPL(dax_do_io); /* * DAX radix tree locking */ struct exceptional_entry_key { struct address_space *mapping; - unsigned long index; + pgoff_t entry_start; }; struct wait_exceptional_entry_queue { @@ -308,6 +129,26 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; +static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, + pgoff_t index, void *entry, struct exceptional_entry_key *key) +{ + unsigned long hash; + + /* + * If 'entry' is a PMD, align the 'index' that we use for the wait + * queue to the start of that PMD. This ensures that all offsets in + * the range covered by the PMD map to the same bit lock. + */ + if (dax_is_pmd_entry(entry)) + index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); + + key->mapping = mapping; + key->entry_start = index; + + hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); + return wait_table + hash; +} + static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, int sync, void *keyp) { @@ -316,7 +157,7 @@ static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, container_of(wait, struct wait_exceptional_entry_queue, wait); if (key->mapping != ewait->key.mapping || - key->index != ewait->key.index) + key->entry_start != ewait->key.entry_start) return 0; return autoremove_wake_function(wait, mode, sync, NULL); } @@ -372,24 +213,24 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) static void *get_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void ***slotp) { - void *ret, **slot; + void *entry, **slot; struct wait_exceptional_entry_queue ewait; - wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; - ewait.key.mapping = mapping; - ewait.key.index = index; for (;;) { - ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (!ret || !radix_tree_exceptional_entry(ret) || + if (!entry || !radix_tree_exceptional_entry(entry) || !slot_locked(mapping, slot)) { if (slotp) *slotp = slot; - return ret; + return entry; } + + wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&mapping->tree_lock); @@ -399,52 +240,156 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, } } +static void put_locked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) { + unlock_page(entry); + put_page(entry); + } else { + dax_unlock_mapping_entry(mapping, index); + } +} + +/* + * Called when we are done with radix tree entry we looked up via + * get_unlocked_mapping_entry() and which we didn't lock in the end. + */ +static void put_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) + return; + + /* We have to wake up next waiter for the radix tree entry lock */ + dax_wake_mapping_entry_waiter(mapping, index, entry, false); +} + /* * Find radix tree entry at given index. If it points to a page, return with * the page locked. If it points to the exceptional entry, return with the * radix tree entry locked. If the radix tree doesn't contain given index, * create empty exceptional entry for the index and return with it locked. * + * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will + * either return that locked entry or will return an error. This error will + * happen if there are any 4k entries (either zero pages or DAX entries) + * within the 2MiB range that we are requesting. + * + * We always favor 4k entries over 2MiB entries. There isn't a flow where we + * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB + * insertion will fail if it finds any 4k entries already in the tree, and a + * 4k insertion will cause an existing 2MiB entry to be unmapped and + * downgraded to 4k entries. This happens for both 2MiB huge zero pages as + * well as 2MiB empty entries. + * + * The exception to this downgrade path is for 2MiB DAX PMD entries that have + * real storage backing them. We will leave these real 2MiB DAX entries in + * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. + * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. */ -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, + unsigned long size_flag) { - void *ret, **slot; + bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ + void *entry, **slot; restart: spin_lock_irq(&mapping->tree_lock); - ret = get_unlocked_mapping_entry(mapping, index, &slot); + entry = get_unlocked_mapping_entry(mapping, index, &slot); + + if (entry) { + if (size_flag & RADIX_DAX_PMD) { + if (!radix_tree_exceptional_entry(entry) || + dax_is_pte_entry(entry)) { + put_unlocked_mapping_entry(mapping, index, + entry); + entry = ERR_PTR(-EEXIST); + goto out_unlock; + } + } else { /* trying to grab a PTE entry */ + if (radix_tree_exceptional_entry(entry) && + dax_is_pmd_entry(entry) && + (dax_is_zero_entry(entry) || + dax_is_empty_entry(entry))) { + pmd_downgrade = true; + } + } + } + /* No entry for given index? Make sure radix tree is big enough. */ - if (!ret) { + if (!entry || pmd_downgrade) { int err; + if (pmd_downgrade) { + /* + * Make sure 'entry' remains valid while we drop + * mapping->tree_lock. + */ + entry = lock_slot(mapping, slot); + } + spin_unlock_irq(&mapping->tree_lock); + /* + * Besides huge zero pages the only other thing that gets + * downgraded are empty entries which don't need to be + * unmapped. + */ + if (pmd_downgrade && dax_is_zero_entry(entry)) + unmap_mapping_range(mapping, + (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + err = radix_tree_preload( mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); - if (err) + if (err) { + if (pmd_downgrade) + put_locked_mapping_entry(mapping, index, entry); return ERR_PTR(err); - ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | - RADIX_DAX_ENTRY_LOCK); + } spin_lock_irq(&mapping->tree_lock); - err = radix_tree_insert(&mapping->page_tree, index, ret); + + if (pmd_downgrade) { + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + dax_wake_mapping_entry_waiter(mapping, index, entry, + true); + } + + entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); + + err = __radix_tree_insert(&mapping->page_tree, index, + dax_radix_order(entry), entry); radix_tree_preload_end(); if (err) { spin_unlock_irq(&mapping->tree_lock); - /* Someone already created the entry? */ - if (err == -EEXIST) + /* + * Someone already created the entry? This is a + * normal failure when inserting PMDs in a range + * that already contains PTEs. In that case we want + * to return -EEXIST immediately. + */ + if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD)) goto restart; + /* + * Our insertion of a DAX PMD entry failed, most + * likely because it collided with a PTE sized entry + * at a different index in the PMD range. We haven't + * inserted anything into the radix tree and have no + * waiters to wake. + */ return ERR_PTR(err); } /* Good, we have inserted empty locked entry into the tree. */ mapping->nrexceptional++; spin_unlock_irq(&mapping->tree_lock); - return ret; + return entry; } /* Normal page in radix tree? */ - if (!radix_tree_exceptional_entry(ret)) { - struct page *page = ret; + if (!radix_tree_exceptional_entry(entry)) { + struct page *page = entry; get_page(page); spin_unlock_irq(&mapping->tree_lock); @@ -457,15 +402,26 @@ restart: } return page; } - ret = lock_slot(mapping, slot); + entry = lock_slot(mapping, slot); + out_unlock: spin_unlock_irq(&mapping->tree_lock); - return ret; + return entry; } +/* + * We do not necessarily hold the mapping->tree_lock when we call this + * function so it is possible that 'entry' is no longer a valid item in the + * radix tree. This is okay because all we really need to do is to find the + * correct waitqueue where tasks might be waiting for that old 'entry' and + * wake them. + */ void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, bool wake_all) + pgoff_t index, void *entry, bool wake_all) { - wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + struct exceptional_entry_key key; + wait_queue_head_t *wq; + + wq = dax_entry_waitqueue(mapping, index, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens @@ -473,54 +429,24 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, * So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. */ - if (waitqueue_active(wq)) { - struct exceptional_entry_key key; - - key.mapping = mapping; - key.index = index; + if (waitqueue_active(wq)) __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); - } } void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) { - void *ret, **slot; + void *entry, **slot; spin_lock_irq(&mapping->tree_lock); - ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || !slot_locked(mapping, slot))) { spin_unlock_irq(&mapping->tree_lock); return; } unlock_slot(mapping, slot); spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, false); -} - -static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) -{ - if (!radix_tree_exceptional_entry(entry)) { - unlock_page(entry); - put_page(entry); - } else { - dax_unlock_mapping_entry(mapping, index); - } -} - -/* - * Called when we are done with radix tree entry we looked up via - * get_unlocked_mapping_entry() and which we didn't lock in the end. - */ -static void put_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) -{ - if (!radix_tree_exceptional_entry(entry)) - return; - - /* We have to wake up next waiter for the radix tree entry lock */ - dax_wake_mapping_entry_waiter(mapping, index, false); + dax_wake_mapping_entry_waiter(mapping, index, entry, false); } /* @@ -547,7 +473,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) radix_tree_delete(&mapping->page_tree, index); mapping->nrexceptional--; spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, true); + dax_wake_mapping_entry_waiter(mapping, index, entry, true); return 1; } @@ -600,11 +526,17 @@ static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size return 0; } -#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) - +/* + * By this point grab_mapping_entry() has ensured that we have a locked entry + * of the appropriate size so we don't have to worry about downgrading PMDs to + * PTEs. If we happen to be trying to insert a PTE and there is a PMD + * already in the tree, we will skip the insertion and just dirty the PMD as + * appropriate. + */ static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, - void *entry, sector_t sector) + void *entry, sector_t sector, + unsigned long flags) { struct radix_tree_root *page_tree = &mapping->page_tree; int error = 0; @@ -627,22 +559,35 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); if (error) return ERR_PTR(error); + } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { + /* replacing huge zero page with PMD block mapping */ + unmap_mapping_range(mapping, + (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); } spin_lock_irq(&mapping->tree_lock); - new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | - RADIX_DAX_ENTRY_LOCK); + new_entry = dax_radix_locked_entry(sector, flags); + if (hole_fill) { __delete_from_page_cache(entry, NULL); /* Drop pagecache reference */ put_page(entry); - error = radix_tree_insert(page_tree, index, new_entry); + error = __radix_tree_insert(page_tree, index, + dax_radix_order(new_entry), new_entry); if (error) { new_entry = ERR_PTR(error); goto unlock; } mapping->nrexceptional++; - } else { + } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + /* + * Only swap our new entry into the radix tree if the current + * entry is a zero page or an empty entry. If a normal PTE or + * PMD entry is already in the tree, we leave it alone. This + * means that if we are trying to insert a PTE and the + * existing entry is a PMD, we will just leave the PMD in the + * tree and dirty it if necessary. + */ struct radix_tree_node *node; void **slot; void *ret; @@ -674,7 +619,6 @@ static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - int type = RADIX_DAX_TYPE(entry); struct radix_tree_node *node; struct blk_dax_ctl dax; void **slot; @@ -695,13 +639,21 @@ static int dax_writeback_one(struct block_device *bdev, if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) goto unlock; - if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { + if (WARN_ON_ONCE(dax_is_empty_entry(entry) || + dax_is_zero_entry(entry))) { ret = -EIO; goto unlock; } - dax.sector = RADIX_DAX_SECTOR(entry); - dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); + /* + * Even if dax_writeback_mapping_range() was given a wbc->range_start + * in the middle of a PMD, the 'index' we are given will be aligned to + * the start index of the PMD, as will the sector we pull from + * 'entry'. This allows us to flush for PMD_SIZE and not have to + * worry about partial PMD writebacks. + */ + dax.sector = dax_radix_sector(entry); + dax.size = PAGE_SIZE << dax_radix_order(entry); spin_unlock_irq(&mapping->tree_lock); /* @@ -740,12 +692,11 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { struct inode *inode = mapping->host; - pgoff_t start_index, end_index, pmd_index; + pgoff_t start_index, end_index; pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; bool done = false; int i, ret = 0; - void *entry; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; @@ -755,15 +706,6 @@ int dax_writeback_mapping_range(struct address_space *mapping, start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; - pmd_index = DAX_PMD_INDEX(start_index); - - rcu_read_lock(); - entry = radix_tree_lookup(&mapping->page_tree, pmd_index); - rcu_read_unlock(); - - /* see if the start of our range is covered by a PMD entry */ - if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) - start_index = pmd_index; tag_pages_for_writeback(mapping, start_index, end_index); @@ -808,7 +750,7 @@ static int dax_insert_mapping(struct address_space *mapping, return PTR_ERR(dax.addr); dax_unmap_atomic(bdev, &dax); - ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); *entryp = ret; @@ -817,323 +759,6 @@ static int dax_insert_mapping(struct address_space *mapping, } /** - * dax_fault - handle a page fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * fault handler for DAX files. dax_fault() assumes the caller has done all - * the necessary locking for the page fault to proceed successfully. - */ -int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - void *entry; - struct buffer_head bh; - unsigned long vaddr = (unsigned long)vmf->virtual_address; - unsigned blkbits = inode->i_blkbits; - sector_t block; - pgoff_t size; - int error; - int major = 0; - - /* - * Check whether offset isn't beyond end of file now. Caller is supposed - * to hold locks serializing us with truncate / punch hole so this is - * a reliable test. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - return VM_FAULT_SIGBUS; - - memset(&bh, 0, sizeof(bh)); - block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); - bh.b_bdev = inode->i_sb->s_bdev; - bh.b_size = PAGE_SIZE; - - entry = grab_mapping_entry(mapping, vmf->pgoff); - if (IS_ERR(entry)) { - error = PTR_ERR(entry); - goto out; - } - - error = get_block(inode, block, &bh, 0); - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; /* fs corruption? */ - if (error) - goto unlock_entry; - - if (vmf->cow_page) { - struct page *new_page = vmf->cow_page; - if (buffer_written(&bh)) - error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), - bh.b_size, new_page, vaddr); - else - clear_user_highpage(new_page, vaddr); - if (error) - goto unlock_entry; - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - return VM_FAULT_LOCKED; - } - vmf->entry = entry; - return VM_FAULT_DAX_LOCKED; - } - - if (!buffer_mapped(&bh)) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; - if (error) - goto unlock_entry; - } else { - return dax_load_hole(mapping, entry, vmf); - } - } - - /* Filesystem should not return unwritten buffers to us! */ - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), - bh.b_size, &entry, vma, vmf); - unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); - out: - if (error == -ENOMEM) - return VM_FAULT_OOM | major; - /* -EBUSY is fine, somebody else faulted on the same PTE */ - if ((error < 0) && (error != -EBUSY)) - return VM_FAULT_SIGBUS | major; - return VM_FAULT_NOPAGE | major; -} -EXPORT_SYMBOL_GPL(dax_fault); - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below function. - */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - -static void __dax_dbg(struct buffer_head *bh, unsigned long address, - const char *reason, const char *fn) -{ - if (bh) { - char bname[BDEVNAME_SIZE]; - bdevname(bh->b_bdev, bname); - pr_debug("%s: %s addr: %lx dev %s state %lx start %lld " - "length %zd fallback: %s\n", fn, current->comm, - address, bname, bh->b_state, (u64)bh->b_blocknr, - bh->b_size, reason); - } else { - pr_debug("%s: %s addr: %lx fallback: %s\n", fn, - current->comm, address, reason); - } -} - -#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") - -/** - * dax_pmd_fault - handle a PMD fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * pmd_fault handler for DAX files. - */ -int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - struct buffer_head bh; - unsigned blkbits = inode->i_blkbits; - unsigned long pmd_addr = address & PMD_MASK; - bool write = flags & FAULT_FLAG_WRITE; - struct block_device *bdev; - pgoff_t size, pgoff; - sector_t block; - int result = 0; - bool alloc = false; - - /* dax pmd mappings require pfn_t_devmap() */ - if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) - return VM_FAULT_FALLBACK; - - /* Fall back to PTEs if we're going to COW */ - if (write && !(vma->vm_flags & VM_SHARED)) { - split_huge_pmd(vma, pmd, address); - dax_pmd_dbg(NULL, address, "cow write"); - return VM_FAULT_FALLBACK; - } - /* If the PMD would extend outside the VMA */ - if (pmd_addr < vma->vm_start) { - dax_pmd_dbg(NULL, address, "vma start unaligned"); - return VM_FAULT_FALLBACK; - } - if ((pmd_addr + PMD_SIZE) > vma->vm_end) { - dax_pmd_dbg(NULL, address, "vma end unaligned"); - return VM_FAULT_FALLBACK; - } - - pgoff = linear_page_index(vma, pmd_addr); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size) - return VM_FAULT_SIGBUS; - /* If the PMD would cover blocks out of the file */ - if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(NULL, address, - "offset + huge page size > file size"); - return VM_FAULT_FALLBACK; - } - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); - - bh.b_size = PMD_SIZE; - - if (get_block(inode, block, &bh, 0) != 0) - return VM_FAULT_SIGBUS; - - if (!buffer_mapped(&bh) && write) { - if (get_block(inode, block, &bh, 1) != 0) - return VM_FAULT_SIGBUS; - alloc = true; - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - } - - bdev = bh.b_bdev; - - /* - * If the filesystem isn't willing to tell us the length of a hole, - * just fall back to PTEs. Calling get_block 512 times in a loop - * would be silly. - */ - if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { - dax_pmd_dbg(&bh, address, "allocated block too small"); - return VM_FAULT_FALLBACK; - } - - /* - * If we allocated new storage, make sure no process has any - * zero pages covering this hole - */ - if (alloc) { - loff_t lstart = pgoff << PAGE_SHIFT; - loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ - - truncate_pagecache_range(inode, lstart, lend); - } - - if (!write && !buffer_mapped(&bh)) { - spinlock_t *ptl; - pmd_t entry; - struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm); - - if (unlikely(!zero_page)) { - dax_pmd_dbg(&bh, address, "no zero page"); - goto fallback; - } - - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_none(*pmd)) { - spin_unlock(ptl); - dax_pmd_dbg(&bh, address, "pmd already present"); - goto fallback; - } - - dev_dbg(part_to_dev(bdev->bd_part), - "%s: %s addr: %lx pfn: <zero> sect: %llx\n", - __func__, current->comm, address, - (unsigned long long) to_sector(&bh, inode)); - - entry = mk_pmd(zero_page, vma->vm_page_prot); - entry = pmd_mkhuge(entry); - set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); - result = VM_FAULT_NOPAGE; - spin_unlock(ptl); - } else { - struct blk_dax_ctl dax = { - .sector = to_sector(&bh, inode), - .size = PMD_SIZE, - }; - long length = dax_map_atomic(bdev, &dax); - - if (length < 0) { - dax_pmd_dbg(&bh, address, "dax-error fallback"); - goto fallback; - } - if (length < PMD_SIZE) { - dax_pmd_dbg(&bh, address, "dax-length too small"); - dax_unmap_atomic(bdev, &dax); - goto fallback; - } - if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { - dax_pmd_dbg(&bh, address, "pfn unaligned"); - dax_unmap_atomic(bdev, &dax); - goto fallback; - } - - if (!pfn_t_devmap(dax.pfn)) { - dax_unmap_atomic(bdev, &dax); - dax_pmd_dbg(&bh, address, "pfn not in memmap"); - goto fallback; - } - dax_unmap_atomic(bdev, &dax); - - /* - * For PTE faults we insert a radix tree entry for reads, and - * leave it clean. Then on the first write we dirty the radix - * tree entry via the dax_pfn_mkwrite() path. This sequence - * allows the dax_pfn_mkwrite() call to be simpler and avoid a - * call into get_block() to translate the pgoff to a sector in - * order to be able to create a new radix tree entry. - * - * The PMD path doesn't have an equivalent to - * dax_pfn_mkwrite(), though, so for a read followed by a - * write we traverse all the way through dax_pmd_fault() - * twice. This means we can just skip inserting a radix tree - * entry completely on the initial read and just wait until - * the write to insert a dirty entry. - */ - if (write) { - /* - * We should insert radix-tree entry and dirty it here. - * For now this is broken... - */ - } - - dev_dbg(part_to_dev(bdev->bd_part), - "%s: %s addr: %lx pfn: %lx sect: %llx\n", - __func__, current->comm, address, - pfn_t_to_pfn(dax.pfn), - (unsigned long long) dax.sector); - result |= vmf_insert_pfn_pmd(vma, address, pmd, - dax.pfn, write); - } - - out: - return result; - - fallback: - count_vm_event(THP_FAULT_FALLBACK); - result = VM_FAULT_FALLBACK; - goto out; -} -EXPORT_SYMBOL_GPL(dax_pmd_fault); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -/** * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault @@ -1193,62 +818,14 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -/** - * dax_zero_page_range - zero a range within a page of a DAX file - * @inode: The file being truncated - * @from: The file offset that is being truncated to - * @length: The number of bytes to zero - * @get_block: The filesystem method used to translate file offsets to blocks - * - * This function can be called by a filesystem when it is zeroing part of a - * page in a DAX file. This is intended for hole-punch operations. If - * you are truncating a file, the helper function dax_truncate_page() may be - * more convenient. - */ -int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, - get_block_t get_block) -{ - struct buffer_head bh; - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - int err; - - /* Block boundary? Nothing to do */ - if (!length) - return 0; - BUG_ON((offset + length) > PAGE_SIZE); - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - bh.b_size = PAGE_SIZE; - err = get_block(inode, index, &bh, 0); - if (err < 0 || !buffer_written(&bh)) - return err; - - return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), - offset, length); -} -EXPORT_SYMBOL_GPL(dax_zero_page_range); - -/** - * dax_truncate_page - handle a partial page being truncated in a DAX file - * @inode: The file being truncated - * @from: The file offset that is being truncated to - * @get_block: The filesystem method used to translate file offsets to blocks - * - * Similar to block_truncate_page(), this function can be called by a - * filesystem when it is truncating a DAX file to handle the partial page. - */ -int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +#ifdef CONFIG_FS_IOMAP +static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) { - unsigned length = PAGE_ALIGN(from) - from; - return dax_zero_page_range(inode, from, length, get_block); + return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); } -EXPORT_SYMBOL_GPL(dax_truncate_page); -#ifdef CONFIG_FS_IOMAP static loff_t -iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, +dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) { struct iov_iter *iter = data; @@ -1272,8 +849,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct blk_dax_ctl dax = { 0 }; ssize_t map_len; - dax.sector = iomap->blkno + - (((pos & PAGE_MASK) - iomap->offset) >> 9); + dax.sector = dax_iomap_sector(iomap, pos); dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; map_len = dax_map_atomic(iomap->bdev, &dax); if (map_len < 0) { @@ -1305,7 +881,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } /** - * iomap_dax_rw - Perform I/O to a DAX file + * dax_iomap_rw - Perform I/O to a DAX file * @iocb: The control block for this I/O * @iter: The addresses to do I/O from or to * @ops: iomap ops passed from the file system @@ -1315,7 +891,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * and evicting any page cache pages in the region under I/O. */ ssize_t -iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, +dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops) { struct address_space *mapping = iocb->ki_filp->f_mapping; @@ -1345,7 +921,7 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, while (iov_iter_count(iter)) { ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, - iter, iomap_dax_actor); + iter, dax_iomap_actor); if (ret <= 0) break; pos += ret; @@ -1355,10 +931,10 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, iocb->ki_pos += done; return done ? done : ret; } -EXPORT_SYMBOL_GPL(iomap_dax_rw); +EXPORT_SYMBOL_GPL(dax_iomap_rw); /** - * iomap_dax_fault - handle a page fault on a DAX file + * dax_iomap_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @ops: iomap ops passed from the file system @@ -1367,7 +943,7 @@ EXPORT_SYMBOL_GPL(iomap_dax_rw); * or mkwrite handler for DAX files. Assumes the caller has done all the * necessary locking for the page fault to proceed successfully. */ -int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, +int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; @@ -1376,8 +952,9 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; sector_t sector; struct iomap iomap = { 0 }; - unsigned flags = 0; + unsigned flags = IOMAP_FAULT; int error, major = 0; + int locked_status = 0; void *entry; /* @@ -1388,7 +965,7 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (pos >= i_size_read(inode)) return VM_FAULT_SIGBUS; - entry = grab_mapping_entry(mapping, vmf->pgoff); + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); if (IS_ERR(entry)) { error = PTR_ERR(entry); goto out; @@ -1407,10 +984,10 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, goto unlock_entry; if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { error = -EIO; /* fs corruption? */ - goto unlock_entry; + goto finish_iomap; } - sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); + sector = dax_iomap_sector(&iomap, pos); if (vmf->cow_page) { switch (iomap.type) { @@ -1429,13 +1006,15 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } if (error) - goto unlock_entry; + goto finish_iomap; if (!radix_tree_exceptional_entry(entry)) { vmf->page = entry; - return VM_FAULT_LOCKED; + locked_status = VM_FAULT_LOCKED; + } else { + vmf->entry = entry; + locked_status = VM_FAULT_DAX_LOCKED; } - vmf->entry = entry; - return VM_FAULT_DAX_LOCKED; + goto finish_iomap; } switch (iomap.type) { @@ -1450,8 +1029,10 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: - if (!(vmf->flags & FAULT_FLAG_WRITE)) - return dax_load_hole(mapping, entry, vmf); + if (!(vmf->flags & FAULT_FLAG_WRITE)) { + locked_status = dax_load_hole(mapping, entry, vmf); + break; + } /*FALLTHRU*/ default: WARN_ON_ONCE(1); @@ -1459,15 +1040,218 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, break; } + finish_iomap: + if (ops->iomap_end) { + if (error) { + /* keep previous error */ + ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, + &iomap); + } else { + error = ops->iomap_end(inode, pos, PAGE_SIZE, + PAGE_SIZE, flags, &iomap); + } + } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + if (!locked_status || error) + put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error < 0 && error != -EBUSY) return VM_FAULT_SIGBUS | major; + if (locked_status) { + WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ + return locked_status; + } return VM_FAULT_NOPAGE | major; } -EXPORT_SYMBOL_GPL(iomap_dax_fault); +EXPORT_SYMBOL_GPL(dax_iomap_fault); + +#ifdef CONFIG_FS_DAX_PMD +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below functions. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + +static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, + struct vm_fault *vmf, unsigned long address, + struct iomap *iomap, loff_t pos, bool write, void **entryp) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct block_device *bdev = iomap->bdev; + struct blk_dax_ctl dax = { + .sector = dax_iomap_sector(iomap, pos), + .size = PMD_SIZE, + }; + long length = dax_map_atomic(bdev, &dax); + void *ret; + + if (length < 0) /* dax_map_atomic() failed */ + return VM_FAULT_FALLBACK; + if (length < PMD_SIZE) + goto unmap_fallback; + if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) + goto unmap_fallback; + if (!pfn_t_devmap(dax.pfn)) + goto unmap_fallback; + + dax_unmap_atomic(bdev, &dax); + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, + RADIX_DAX_PMD); + if (IS_ERR(ret)) + return VM_FAULT_FALLBACK; + *entryp = ret; + + return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); + + unmap_fallback: + dax_unmap_atomic(bdev, &dax); + return VM_FAULT_FALLBACK; +} + +static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, + struct vm_fault *vmf, unsigned long address, + struct iomap *iomap, void **entryp) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long pmd_addr = address & PMD_MASK; + struct page *zero_page; + spinlock_t *ptl; + pmd_t pmd_entry; + void *ret; + + zero_page = mm_get_huge_zero_page(vma->vm_mm); + + if (unlikely(!zero_page)) + return VM_FAULT_FALLBACK; + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, + RADIX_DAX_PMD | RADIX_DAX_HZP); + if (IS_ERR(ret)) + return VM_FAULT_FALLBACK; + *entryp = ret; + + ptl = pmd_lock(vma->vm_mm, pmd); + if (!pmd_none(*pmd)) { + spin_unlock(ptl); + return VM_FAULT_FALLBACK; + } + + pmd_entry = mk_pmd(zero_page, vma->vm_page_prot); + pmd_entry = pmd_mkhuge(pmd_entry); + set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry); + spin_unlock(ptl); + return VM_FAULT_NOPAGE; +} + +int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, struct iomap_ops *ops) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long pmd_addr = address & PMD_MASK; + bool write = flags & FAULT_FLAG_WRITE; + unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; + struct inode *inode = mapping->host; + int result = VM_FAULT_FALLBACK; + struct iomap iomap = { 0 }; + pgoff_t max_pgoff, pgoff; + struct vm_fault vmf; + void *entry; + loff_t pos; + int error; + + /* Fall back to PTEs if we're going to COW */ + if (write && !(vma->vm_flags & VM_SHARED)) + goto fallback; + + /* If the PMD would extend outside the VMA */ + if (pmd_addr < vma->vm_start) + goto fallback; + if ((pmd_addr + PMD_SIZE) > vma->vm_end) + goto fallback; + + /* + * Check whether offset isn't beyond end of file now. Caller is + * supposed to hold locks serializing us with truncate / punch hole so + * this is a reliable test. + */ + pgoff = linear_page_index(vma, pmd_addr); + max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + + if (pgoff > max_pgoff) + return VM_FAULT_SIGBUS; + + /* If the PMD would extend beyond the file size */ + if ((pgoff | PG_PMD_COLOUR) > max_pgoff) + goto fallback; + + /* + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX + * PMD or a HZP entry. If it can't (because a 4k page is already in + * the tree, for instance), it will return -EEXIST and we just fall + * back to 4k entries. + */ + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto fallback; + + /* + * Note that we don't use iomap_apply here. We aren't doing I/O, only + * setting up a mapping, so really we're using iomap_begin() as a way + * to look up our filesystem block. + */ + pos = (loff_t)pgoff << PAGE_SHIFT; + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); + if (error) + goto unlock_entry; + if (iomap.offset + iomap.length < pos + PMD_SIZE) + goto finish_iomap; + + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; + + switch (iomap.type) { + case IOMAP_MAPPED: + result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, + &iomap, pos, write, &entry); + break; + case IOMAP_UNWRITTEN: + case IOMAP_HOLE: + if (WARN_ON_ONCE(write)) + goto finish_iomap; + result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, + &entry); + break; + default: + WARN_ON_ONCE(1); + break; + } + + finish_iomap: + if (ops->iomap_end) { + if (result == VM_FAULT_FALLBACK) { + ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags, + &iomap); + } else { + error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE, + iomap_flags, &iomap); + if (error) + result = VM_FAULT_FALLBACK; + } + } + unlock_entry: + put_locked_mapping_entry(mapping, pgoff, entry); + fallback: + if (result == VM_FAULT_FALLBACK) { + split_huge_pmd(vma, pmd, address); + count_vm_event(THP_FAULT_FALLBACK); + } + return result; +} +EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); +#endif /* CONFIG_FS_DAX_PMD */ #endif /* CONFIG_FS_IOMAP */ diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a0e1478dfd04..b0f241528a30 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -38,7 +38,7 @@ static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) return 0; /* skip atime */ inode_lock_shared(inode); - ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); + ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); @@ -62,7 +62,7 @@ static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out_unlock; - ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); + ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); mark_inode_dirty(inode); @@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); + ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -107,27 +107,6 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } -static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) -{ - struct inode *inode = file_inode(vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - int ret; - - if (flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - } - down_read(&ei->dax_sem); - - ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); - - up_read(&ei->dax_sem); - if (flags & FAULT_FLAG_WRITE) - sb_end_pagefault(inode->i_sb); - return ret; -} - static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -154,7 +133,11 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, - .pmd_fault = ext2_dax_pmd_fault, + /* + * .pmd_fault is not supported for DAX because allocation in ext2 + * cannot be reliably aligned to huge page sizes and so pmd faults + * will always fail and fail back to regular faults. + */ .page_mkwrite = ext2_dax_fault, .pfn_mkwrite = ext2_dax_pfn_mkwrite, }; @@ -166,7 +149,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &ext2_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_MIXEDMAP; return 0; } #else diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 41b8b44a391c..046b642f3585 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -850,6 +850,9 @@ struct iomap_ops ext2_iomap_ops = { .iomap_begin = ext2_iomap_begin, .iomap_end = ext2_iomap_end, }; +#else +/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */ +struct iomap_ops ext2_iomap_ops; #endif /* CONFIG_FS_DAX */ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -1293,9 +1296,11 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (IS_DAX(inode)) - error = dax_truncate_page(inode, newsize, ext2_get_block); - else if (test_opt(inode->i_sb, NOBH)) + if (IS_DAX(inode)) { + error = iomap_zero_range(inode, newsize, + PAGE_ALIGN(newsize) - newsize, NULL, + &ext2_iomap_ops); + } else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); else diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index e38039fd96ff..7b90691e98c4 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -37,6 +37,7 @@ config EXT4_FS select CRC16 select CRYPTO select CRYPTO_CRC32C + select FS_IOMAP if FS_DAX help This is the next generation of the ext3 filesystem. diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index dfa519979038..fd389935ecd1 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -196,7 +196,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, error = posix_acl_update_mode(inode, &inode->i_mode, &acl); if (error) return error; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); } break; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a8a750f59621..2163c1e69f2a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -397,8 +397,9 @@ struct flex_groups { #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ +/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ @@ -1533,12 +1534,6 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) return container_of(inode, struct ext4_inode_info, vfs_inode); } -static inline struct timespec ext4_current_time(struct inode *inode) -{ - return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? - current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -} - static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -2277,11 +2272,6 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -static inline int ext4_sb_has_crypto(struct super_block *sb) -{ - return ext4_has_feature_encrypt(sb); -} - static inline bool ext4_encrypted_inode(struct inode *inode) { return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); @@ -2339,8 +2329,8 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } #define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page #define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page #define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy #define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context #define fscrypt_inherit_context fscrypt_notsupp_inherit_context #define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info @@ -2458,8 +2448,6 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_dio_get_block(struct inode *inode, sector_t iblock, @@ -2492,7 +2480,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); -extern void ext4_truncate(struct inode *); +extern int ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); @@ -3129,7 +3117,7 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_truncate(handle_t *, struct inode *); extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); @@ -3265,12 +3253,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } } -static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) -{ - int blksize = 1 << inode->i_blkbits; - - return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); -} +extern struct iomap_ops ext4_iomap_ops; #endif /* __KERNEL__ */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b1d52c14098e..f97611171023 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -414,17 +414,19 @@ static inline int ext4_inode_journal_mode(struct inode *inode) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC)) + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC))) { + /* We do not support data journalling for encrypted data */ + if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + } if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - else - BUG(); + BUG(); } static inline int ext4_should_journal_data(struct inode *inode) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c930a0110fb4..3e1014fe835e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4631,7 +4631,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(handle_t *handle, struct inode *inode) +int ext4_ext_truncate(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; @@ -4645,7 +4645,9 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode) /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); + if (err) + return err; last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); @@ -4657,12 +4659,9 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - if (err) { - ext4_std_error(inode->i_sb, err); - return; - } - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); - ext4_std_error(inode->i_sb, err); + if (err) + return err; + return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, @@ -4701,7 +4700,7 @@ retry: /* * Recalculate credits when extent tree depth changes. */ - if (depth >= 0 && depth != ext_depth(inode)) { + if (depth != ext_depth(inode)) { credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); } @@ -4725,7 +4724,7 @@ retry: map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); if (new_size) { if (epos > new_size) epos = new_size; @@ -4853,7 +4852,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); @@ -4878,7 +4877,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_dio; } - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); if (new_size) { ext4_update_inode_size(inode, new_size); } else { @@ -5568,7 +5567,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: @@ -5678,7 +5677,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2a822d30e73f..b5f184493c57 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -31,6 +31,42 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_FS_DAX +static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + inode_lock_shared(inode); + /* + * Recheck under inode lock - at this point we are sure it cannot + * change anymore + */ + if (!IS_DAX(inode)) { + inode_unlock_shared(inode); + /* Fallback to buffered IO in case we cannot support DAX */ + return generic_file_read_iter(iocb, to); + } + ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} +#endif + +static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + if (!iov_iter_count(to)) + return 0; /* skip atime */ + +#ifdef CONFIG_FS_DAX + if (IS_DAX(file_inode(iocb->ki_filp))) + return ext4_dax_read_iter(iocb, to); +#endif + return generic_file_read_iter(iocb, to); +} + /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release @@ -88,6 +124,86 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) return 0; } +/* Is IO overwriting allocated and initialized blocks? */ +static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) +{ + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, blklen; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = pos >> blkbits; + map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); + blklen = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); + /* + * 'err==len' means that all of the blocks have been preallocated, + * regardless of whether they have been initialized or not. To exclude + * unwritten extents, we need to check m_flags. + */ + return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + return ret; + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) + return -EFBIG; + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); + } + return iov_iter_count(from); +} + +#ifdef CONFIG_FS_DAX +static ssize_t +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + bool overwrite = false; + + inode_lock(inode); + ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + ret = file_update_time(iocb->ki_filp); + if (ret) + goto out; + + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + overwrite = true; + downgrade_write(&inode->i_rwsem); + } + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); +out: + if (!overwrite) + inode_unlock(inode); + else + inode_unlock_shared(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +#endif + static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -97,8 +213,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) int overwrite = 0; ssize_t ret; +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_write_iter(iocb, from); +#endif + inode_lock(inode); - ret = generic_write_checks(iocb, from); + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -114,53 +235,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_unwritten_wait(inode); } - /* - * If we have encountered a bitmap-format file, the size limit - * is smaller than s_maxbytes, which is for extent-mapped files. - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - - if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) { - ret = -EFBIG; - goto out; - } - iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); - } - iocb->private = &overwrite; - if (o_direct) { - size_t length = iov_iter_count(from); - loff_t pos = iocb->ki_pos; - - /* check whether we do a DIO overwrite or not */ - if (ext4_should_dioread_nolock(inode) && !unaligned_aio && - pos + length <= i_size_read(inode)) { - struct ext4_map_blocks map; - unsigned int blkbits = inode->i_blkbits; - int err, len; - - map.m_lblk = pos >> blkbits; - map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); - len = map.m_len; - - err = ext4_map_blocks(NULL, inode, &map, 0); - /* - * 'err==len' means that all of blocks has - * been preallocated no matter they are - * initialized or not. For excluding - * unwritten extents, we need to check - * m_flags. There are two conditions that - * indicate for initialized extents. 1) If we - * hit extent cache, EXT4_MAP_MAPPED flag is - * returned; 2) If we do a real lookup, - * non-flags are returned. So we should check - * these two conditions. - */ - if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) - overwrite = 1; - } - } + /* Check whether we do a DIO overwrite or not */ + if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && + ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) + overwrite = 1; ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); @@ -196,7 +275,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = dax_fault(vma, vmf, ext4_dax_get_block); + result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); if (write) { if (!IS_ERR(handle)) @@ -230,9 +309,10 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; - else - result = dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_get_block); + else { + result = dax_iomap_pmd_fault(vma, addr, pmd, flags, + &ext4_iomap_ops); + } if (write) { if (!IS_ERR(handle)) @@ -687,7 +767,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, - .read_iter = generic_file_read_iter, + .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 170421edfdfe..e57e8d90ea54 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1039,7 +1039,7 @@ got: /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = - ext4_current_time(inode); + current_time(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; @@ -1115,8 +1115,7 @@ got: } if (encrypt) { - /* give pointer to avoid set_context with journal ops. */ - err = fscrypt_inherit_context(dir, inode, &encrypt, true); + err = fscrypt_inherit_context(dir, inode, handle, true); if (err) goto fail_free_drop; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f74d5ee2cdec..437df6a1a841 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -299,6 +299,11 @@ static int ext4_create_inline_data(handle_t *handle, EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); + /* + * Propagate changes to inode->i_flags as well - e.g. S_DAX may + * get cleared + */ + ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -336,8 +341,10 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); - if (!value) + if (!value) { + error = -ENOMEM; goto out; + } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); @@ -442,6 +449,11 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + /* + * Propagate changes to inode->i_flags as well - e.g. S_DAX may + * get set. + */ + ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -1028,7 +1040,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -1971,7 +1983,7 @@ out: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9c064727ed62..72d593fa690d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/bitops.h> +#include <linux/iomap.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -71,10 +72,9 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, - EXT4_INODE_SIZE(inode->i_sb) - - offset); } + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_INODE_SIZE(inode->i_sb) - offset); } return csum; @@ -261,8 +261,15 @@ void ext4_evict_inode(struct inode *inode) "couldn't mark inode dirty (err %d)", err); goto stop_handle; } - if (inode->i_blocks) - ext4_truncate(inode); + if (inode->i_blocks) { + err = ext4_truncate(inode); + if (err) { + ext4_error(inode->i_sb, + "couldn't truncate inode %lu (err %d)", + inode->i_ino, err); + goto stop_handle; + } + } /* * ext4_ext_truncate() doesn't reserve any slop when it @@ -767,6 +774,9 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; + } else if (ret == 0) { + /* hole case, need to fill in bh->b_size */ + bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } @@ -1166,7 +1176,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = fscrypt_decrypt_page(page); + err = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); return err; } #endif @@ -2891,7 +2902,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb)) { + if (ext4_nonda_switch(inode->i_sb) || + S_ISLNK(inode->i_mode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); @@ -3268,53 +3280,159 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX -/* - * Get block function for DAX IO and mmap faults. It takes care of converting - * unwritten extents to written ones and initializes new / converted blocks - * to zeros. - */ -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap) { + unsigned int blkbits = inode->i_blkbits; + unsigned long first_block = offset >> blkbits; + unsigned long last_block = (offset + length - 1) >> blkbits; + struct ext4_map_blocks map; int ret; - ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); - if (!create) - return _ext4_get_block(inode, iblock, bh_result, 0); + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) - return ret; + map.m_lblk = first_block; + map.m_len = last_block - first_block + 1; + + if (!(flags & IOMAP_WRITE)) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + } else { + int dio_credits; + handle_t *handle; + int retries = 0; - if (buffer_unwritten(bh_result)) { + /* Trim mapping request to maximum we can map at once for DIO */ + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); +retry: /* - * We are protected by i_mmap_sem or i_mutex so we know block - * cannot go away from under us even though we dropped - * i_data_sem. Convert extent to written and write zeros there. + * Either we allocate blocks and then we don't get unwritten + * extent so we have reserved enough credits, or the blocks + * are already allocated and unwritten and in that case + * extent conversion fits in the credits as well. */ - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_CONVERT | - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) { + ext4_journal_stop(handle); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; return ret; + } + + /* + * If we added blocks beyond i_size, we need to make sure they + * will get truncated if we crash before updating i_size in + * ext4_iomap_end(). For faults we don't need to do that (and + * even cannot because for orphan list operations inode_lock is + * required) - if we happen to instantiate block beyond i_size, + * it is because we race with truncate which has already added + * the inode to the orphan list. + */ + if (!(flags & IOMAP_FAULT) && first_block + map.m_len > + (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) { + int err; + + err = ext4_orphan_add(handle, inode); + if (err < 0) { + ext4_journal_stop(handle); + return err; + } + } + ext4_journal_stop(handle); } - /* - * At least for now we have to clear BH_New so that DAX code - * doesn't attempt to zero blocks again in a racy way. - */ - clear_buffer_new(bh_result); + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = first_block << blkbits; + + if (ret == 0) { + iomap->type = IOMAP_HOLE; + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->length = (u64)map.m_len << blkbits; + } else { + if (map.m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + } else { + WARN_ON_ONCE(1); + return -EIO; + } + iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9); + iomap->length = (u64)map.m_len << blkbits; + } + + if (map.m_flags & EXT4_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; return 0; } -#else -/* Just define empty function, it will never get called. */ -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) + +static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) { - BUG(); - return 0; + int ret = 0; + handle_t *handle; + int blkbits = inode->i_blkbits; + bool truncate = false; + + if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto orphan_del; + } + if (ext4_update_inode_size(inode, offset + written)) + ext4_mark_inode_dirty(handle, inode); + /* + * We may need to truncate allocated but not written blocks beyond EOF. + */ + if (iomap->offset + iomap->length > + ALIGN(inode->i_size, 1 << blkbits)) { + ext4_lblk_t written_blk, end_blk; + + written_blk = (offset + written) >> blkbits; + end_blk = (offset + length) >> blkbits; + if (written_blk < end_blk && ext4_can_truncate(inode)) + truncate = true; + } + /* + * Remove inode from orphan list if we were extending a inode and + * everything went fine. + */ + if (!truncate && inode->i_nlink && + !list_empty(&EXT4_I(inode)->i_orphan)) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + if (truncate) { + ext4_truncate_failed_write(inode); +orphan_del: + /* + * If truncate failed early the inode might still be on the + * orphan list; we need to make sure the inode is removed from + * the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret; } + +struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, + .iomap_end = ext4_iomap_end, +}; + #endif static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -3436,19 +3554,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) iocb->private = NULL; if (overwrite) get_block_func = ext4_dio_get_block_overwrite; - else if (IS_DAX(inode)) { - /* - * We can avoid zeroing for aligned DAX writes beyond EOF. Other - * writes need zeroing either because they can race with page - * faults or because they use partial blocks. - */ - if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size && - ext4_aligned_io(inode, offset, count)) - get_block_func = ext4_dio_get_block; - else - get_block_func = ext4_dax_get_block; - dio_flags = DIO_LOCKING; - } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { get_block_func = ext4_dio_get_block; dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; @@ -3462,14 +3568,9 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) #ifdef CONFIG_EXT4_FS_ENCRYPTION BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); #endif - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, get_block_func, - ext4_end_io_dio, dio_flags); - } else - ret = __blockdev_direct_IO(iocb, inode, - inode->i_sb->s_bdev, iter, - get_block_func, - ext4_end_io_dio, NULL, dio_flags); + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, + get_block_func, ext4_end_io_dio, NULL, + dio_flags); if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { @@ -3538,6 +3639,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; /* @@ -3546,19 +3648,12 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) * we are protected against page writeback as well. */ inode_lock_shared(inode); - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); - } else { - size_t count = iov_iter_count(iter); - - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count); - if (ret) - goto out_unlock; - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, ext4_dio_get_block, - NULL, NULL, 0); - } + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, + iocb->ki_pos + count); + if (ret) + goto out_unlock; + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, ext4_dio_get_block, NULL, NULL, 0); out_unlock: inode_unlock_shared(inode); return ret; @@ -3587,6 +3682,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (ext4_has_inline_data(inode)) return 0; + /* DAX uses iomap path now */ + if (WARN_ON_ONCE(IS_DAX(inode))) + return 0; + trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == READ) ret = ext4_direct_IO_read(iocb, iter); @@ -3615,6 +3714,13 @@ static int ext4_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } +static int ext4_set_page_dirty(struct page *page) +{ + WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page)); + WARN_ON_ONCE(!page_has_buffers(page)); + return __set_page_dirty_buffers(page); +} + static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, @@ -3622,6 +3728,7 @@ static const struct address_space_operations ext4_aops = { .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, + .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, @@ -3654,6 +3761,7 @@ static const struct address_space_operations ext4_da_aops = { .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, + .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_da_invalidatepage, .releasepage = ext4_releasepage, @@ -3743,7 +3851,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); - WARN_ON_ONCE(fscrypt_decrypt_page(page)); + WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host, + page, PAGE_SIZE, 0, page->index)); } } if (ext4_should_journal_data(inode)) { @@ -3792,8 +3901,10 @@ static int ext4_block_zero_page_range(handle_t *handle, if (length > max || length < 0) length = max; - if (IS_DAX(inode)) - return dax_zero_page_range(inode, from, length, ext4_get_block); + if (IS_DAX(inode)) { + return iomap_zero_range(inode, from, length, NULL, + &ext4_iomap_ops); + } return __ext4_block_zero_page_range(handle, mapping, from, length); } @@ -4026,7 +4137,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: ext4_journal_stop(handle); @@ -4091,10 +4202,11 @@ int ext4_inode_attach_jinode(struct inode *inode) * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ -void ext4_truncate(struct inode *inode) +int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; + int err = 0; handle_t *handle; struct address_space *mapping = inode->i_mapping; @@ -4108,7 +4220,7 @@ void ext4_truncate(struct inode *inode) trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) - return; + return 0; ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); @@ -4120,13 +4232,13 @@ void ext4_truncate(struct inode *inode) ext4_inline_data_truncate(inode, &has_inline); if (has_inline) - return; + return 0; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { if (ext4_inode_attach_jinode(inode) < 0) - return; + return 0; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4135,10 +4247,8 @@ void ext4_truncate(struct inode *inode) credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ext4_std_error(inode->i_sb, PTR_ERR(handle)); - return; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); @@ -4152,7 +4262,8 @@ void ext4_truncate(struct inode *inode) * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ - if (ext4_orphan_add(handle, inode)) + err = ext4_orphan_add(handle, inode); + if (err) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); @@ -4160,11 +4271,13 @@ void ext4_truncate(struct inode *inode) ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ext4_ext_truncate(handle, inode); + err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); + if (err) + goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -4180,11 +4293,12 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); + return err; } /* @@ -4352,7 +4466,9 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) && + !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) && + !ext4_encrypted_inode(inode)) new_fl |= S_DAX; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); @@ -4411,7 +4527,9 @@ static inline void ext4_iget_extra_inode(struct inode *inode, { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= + EXT4_INODE_SIZE(inode->i_sb) && + *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); ext4_find_inline_data_nolock(inode); } else @@ -4434,6 +4552,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; + loff_t size; int block; uid_t i_uid; gid_t i_gid; @@ -4456,10 +4575,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT4_INODE_SIZE(inode->i_sb)) { - EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", - EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, - EXT4_INODE_SIZE(inode->i_sb)); + EXT4_INODE_SIZE(inode->i_sb) || + (ei->i_extra_isize & 3)) { + EXT4_ERROR_INODE(inode, + "bad extra_isize %u (inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } @@ -4534,6 +4655,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); + if ((size = i_size_read(inode)) < 0) { + EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); + ret = -EFSCORRUPTED; + goto bad_inode; + } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; @@ -4577,6 +4703,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ + BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { @@ -5154,7 +5281,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * update c/mtime in shrink case below */ if (!shrink) { - inode->i_mtime = ext4_current_time(inode); + inode->i_mtime = current_time(inode); inode->i_ctime = inode->i_mtime; } down_write(&EXT4_I(inode)->i_data_sem); @@ -5199,12 +5326,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); - if (shrink) - ext4_truncate(inode); + if (shrink) { + rc = ext4_truncate(inode); + if (rc) + error = rc; + } up_write(&EXT4_I(inode)->i_mmap_sem); } - if (!rc) { + if (!error) { setattr_copy(inode, attr); mark_inode_dirty(inode); } @@ -5216,7 +5346,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); - if (!rc && (ia_valid & ATTR_MODE)) + if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(inode, inode->i_mode); err_out: @@ -5455,18 +5585,20 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) return err; - if (ext4_handle_valid(handle) && - EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { /* - * We need extra buffer credits since we may write into EA block + * In nojournal mode, we can immediately attempt to expand + * the inode. When journaled, we first need to obtain extra + * buffer credits since we may write into the EA block * with this same handle. If journal_extend fails, then it will * only result in a minor loss of functionality for that inode. * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. */ - if ((jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { + if (!ext4_handle_valid(handle) || + jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) { ret = ext4_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); @@ -5620,6 +5752,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); + /* + * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated. + * E.g. S_DAX may get cleared / set. + */ + ext4_set_inode_flags(inode); jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_journal_flag_rwsem); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bf5ae8ebbc97..49fd1371bfa2 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -153,7 +153,7 @@ static long swap_inode_boot_loader(struct super_block *sb, swap_inode_data(inode, inode_bl); - inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + inode->i_ctime = inode_bl->i_ctime = current_time(inode); spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -191,6 +191,7 @@ journal_err_out: return err; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION static int uuid_is_zero(__u8 u[16]) { int i; @@ -200,6 +201,7 @@ static int uuid_is_zero(__u8 u[16]) return 0; return 1; } +#endif static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) @@ -248,8 +250,11 @@ static int ext4_ioctl_setflags(struct inode *inode, err = -EOPNOTSUPP; goto flags_out; } - } else if (oldflags & EXT4_EOFBLOCKS_FL) - ext4_truncate(inode); + } else if (oldflags & EXT4_EOFBLOCKS_FL) { + err = ext4_truncate(inode); + if (err) + goto flags_out; + } handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { @@ -265,6 +270,9 @@ static int ext4_ioctl_setflags(struct inode *inode, for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; + /* These flags get special treatment later */ + if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) + continue; if (mask & flags) ext4_set_inode_flag(inode, i); else @@ -272,7 +280,7 @@ static int ext4_ioctl_setflags(struct inode *inode, } ext4_set_inode_flags(inode); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: @@ -368,7 +376,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) } EXT4_I(inode)->i_projid = kprojid; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) @@ -409,6 +417,10 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) return xflags; } +#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ + FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + /* Transfer xflags flags to internal */ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) { @@ -453,12 +465,22 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; + if (flags & ~EXT4_FL_USER_VISIBLE) + return -EOPNOTSUPP; + /* + * chattr(1) grabs flags via GETFLAGS, modifies the result and + * passes that to SETFLAGS. So we cannot easily make SETFLAGS + * more restrictive than just silently masking off visible but + * not settable flags as we always did. + */ + flags &= EXT4_FL_USER_MODIFIABLE; + if (ext4_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + err = mnt_want_write_file(filp); if (err) return err; - flags = ext4_mask_flags(inode->i_mode, flags); - inode_lock(inode); err = ext4_ioctl_setflags(inode, flags); inode_unlock(inode); @@ -500,7 +522,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } @@ -765,28 +787,19 @@ resizefs_out: } case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); - case EXT4_IOC_SET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; + case EXT4_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); - if (copy_from_user(&policy, - (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - return fscrypt_process_policy(filp, &policy); -#else - return -EOPNOTSUPP; -#endif - } case EXT4_IOC_GET_ENCRYPTION_PWSALT: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); handle_t *handle; - if (!ext4_sb_has_crypto(sb)) + if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { err = mnt_want_write_file(filp); @@ -816,24 +829,13 @@ resizefs_out: sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; - } - case EXT4_IOC_GET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; - int err = 0; - - if (!ext4_encrypted_inode(inode)) - return -ENOENT; - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; #else return -EOPNOTSUPP; #endif } + case EXT4_IOC_GET_ENCRYPTION_POLICY: + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -865,13 +867,17 @@ resizefs_out: if (!inode_owner_or_capable(inode)) return -EACCES; + if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS) + return -EOPNOTSUPP; + + flags = ext4_xflags_to_iflags(fa.fsx_xflags); + if (ext4_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + err = mnt_want_write_file(filp); if (err) return err; - flags = ext4_xflags_to_iflags(fa.fsx_xflags); - flags = ext4_mask_flags(inode->i_mode, flags); - inode_lock(inode); flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f418f55c2bbe..7ae43c59bc79 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -669,7 +669,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; - unsigned short border; + unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); @@ -2287,7 +2287,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; - ext4_grpblk_t counters[16]; + ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; } sg; group--; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 104f8bfba718..eadba919f26b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1941,7 +1941,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -2987,7 +2987,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_dec_count(handle, dir); ext4_update_dx_flag(dir); @@ -3050,13 +3050,13 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_unlink; - dir->i_ctime = dir->i_mtime = ext4_current_time(dir); + dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); end_unlink: @@ -3254,7 +3254,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_inc_count(handle, inode); ihold(inode); @@ -3381,7 +3381,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, ent->de->file_type = file_type; ent->dir->i_version++; ent->dir->i_ctime = ent->dir->i_mtime = - ext4_current_time(ent->dir); + current_time(ent->dir); ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3651,7 +3651,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = ext4_current_time(old.inode); + old.inode->i_ctime = current_time(old.inode); ext4_mark_inode_dirty(handle, old.inode); if (!whiteout) { @@ -3663,9 +3663,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new.inode) { ext4_dec_count(handle, new.inode); - new.inode->i_ctime = ext4_current_time(new.inode); + new.inode->i_ctime = current_time(new.inode); } - old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); + old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); ext4_update_dx_flag(old.dir); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); @@ -3723,6 +3723,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, }; u8 new_file_type; int retval; + struct timespec ctime; if ((ext4_encrypted_inode(old_dir) || ext4_encrypted_inode(new_dir)) && @@ -3823,8 +3824,9 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = ext4_current_time(old.inode); - new.inode->i_ctime = ext4_current_time(new.inode); + ctime = current_time(old.inode); + old.inode->i_ctime = ctime; + new.inode->i_ctime = ctime; ext4_mark_inode_dirty(handle, old.inode); ext4_mark_inode_dirty(handle, new.inode); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index e0b3b54cdef3..e2332a65e8fb 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -470,7 +470,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; retry_encrypt: - data_page = fscrypt_encrypt_page(inode, page, gfp_flags); + data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, + page->index, gfp_flags); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index caa4147cda47..dfc8309d7755 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -863,7 +863,6 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_free_rwsem(&sbi->s_journal_flag_rwsem); - brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); @@ -895,6 +894,7 @@ static void ext4_put_super(struct super_block *sb) } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); + brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the @@ -1114,37 +1114,55 @@ static int ext4_prepare_context(struct inode *inode) static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { - handle_t *handle; - int res, res2; + handle_t *handle = fs_data; + int res, res2, retries = 0; + + /* + * If a journal handle was specified, then the encryption context is + * being set on a new inode via inheritance and is part of a larger + * transaction to create the inode. Otherwise the encryption context is + * being set on an existing inode in its own transaction. Only in the + * latter case should the "retry on ENOSPC" logic be used. + */ - /* fs_data is null when internally used. */ - if (fs_data) { - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + if (handle) { + res = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + /* + * Update inode->i_flags - e.g. S_DAX may get disabled + */ + ext4_set_inode_flags(inode); } return res; } +retry: handle = ext4_journal_start(inode, EXT4_HT_MISC, ext4_jbd2_credits_xattr(inode)); if (IS_ERR(handle)) return PTR_ERR(handle); - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + /* Update inode->i_flags - e.g. S_DAX may get disabled */ + ext4_set_inode_flags(inode); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); } res2 = ext4_journal_stop(handle); + + if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (!res) res = res2; return res; @@ -1883,12 +1901,6 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && - test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit " - "in data=ordered mode"); - return 0; - } return 1; } @@ -2330,7 +2342,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; - int nr_orphans = 0, nr_truncates = 0; + int ret, nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA int i; #endif @@ -2412,7 +2424,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); - ext4_truncate(inode); + ret = ext4_truncate(inode); + if (ret) + ext4_std_error(inode->i_sb, ret); inode_unlock(inode); nr_truncates++; } else { @@ -3193,10 +3207,15 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_set_bit(s++, buf); count++; } - for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { - ext4_set_bit(EXT4_B2C(sbi, s++), buf); - count++; + j = ext4_bg_num_gdb(sb, grp); + if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_error(sb, "Invalid number of block group " + "descriptor blocks: %d", j); + j = EXT4_BLOCKS_PER_GROUP(sb) - s; } + count += j; + for (; j > 0; j--) + ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; @@ -3301,7 +3320,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; struct ext4_super_block *es = NULL; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); ext4_fsblk_t block; ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; @@ -3320,16 +3339,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - goto out_free_orig; + if ((data && !orig_data) || !sbi) + goto out_free_base; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) { - kfree(sbi); - goto out_free_orig; - } + if (!sbi->s_blockgroup_lock) + goto out_free_base; + sb->s_fs_info = sbi; sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; @@ -3475,11 +3492,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, - &journal_devnum, &journal_ioprio, 0)) { - ext4_msg(sb, KERN_WARNING, - "failed to parse options in superblock: %s", - sbi->s_es->s_mount_opts); + if (sbi->s_es->s_mount_opts[0]) { + char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, + sizeof(sbi->s_es->s_mount_opts), + GFP_KERNEL); + if (!s_mount_opts) + goto failed_mount; + if (!parse_options(s_mount_opts, sb, &journal_devnum, + &journal_ioprio, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + s_mount_opts); + } + kfree(s_mount_opts); } sbi->s_def_mount_opt = sbi->s_mount_opt; if (!parse_options((char *) data, sb, &journal_devnum, @@ -3505,6 +3530,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dax"); goto failed_mount; } + if (ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_WARNING, + "encrypted files will use data=ordered " + "instead of data journaling mode"); + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { @@ -3660,12 +3690,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) - goto cantfind_ext4; sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", + sbi->s_blocks_per_group); + goto failed_mount; + } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); @@ -3748,13 +3782,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_cluster_ratio = clustersize / blocksize; - if (sbi->s_inodes_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#inodes per group too big: %lu", - sbi->s_inodes_per_group); - goto failed_mount; - } - /* Do we have standard group size of clustersize * 8 blocks ? */ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); @@ -3814,6 +3841,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); + if (ext4_has_feature_meta_bg(sb)) { + if (le32_to_cpu(es->s_first_meta_bg) >= db_count) { + ext4_msg(sb, KERN_WARNING, + "first meta block group too large: %u " + "(group descriptor block count %u)", + le32_to_cpu(es->s_first_meta_bg), db_count); + goto failed_mount; + } + } sbi->s_group_desc = ext4_kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); @@ -3967,6 +4003,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) default: break; } + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + goto failed_mount_wq; + } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; @@ -4160,7 +4204,9 @@ no_journal: if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + "Opts: %.*s%s%s", descr, + (int) sizeof(sbi->s_es->s_mount_opts), + sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); if (es->s_error_count) @@ -4239,8 +4285,8 @@ failed_mount: out_fail: sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); +out_free_base: kfree(sbi); -out_free_orig: kfree(orig_data); return err ? err : ret; } @@ -4550,7 +4596,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeinodes_counter)); BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); - lock_buffer(sbh); + if (sync) + lock_buffer(sbh); if (buffer_write_io_error(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -4566,8 +4613,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) set_buffer_uptodate(sbh); } mark_buffer_dirty(sbh); - unlock_buffer(sbh); if (sync) { + unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC); if (error) @@ -4857,6 +4904,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + err = -EINVAL; + goto restore_opts; + } } if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { @@ -5366,7 +5420,7 @@ static int ext4_quota_off(struct super_block *sb, int type) handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto out; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index d77be9e9f535..5a94fa52b74f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -185,6 +185,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, { struct ext4_xattr_entry *e = entry; + /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) @@ -192,15 +193,29 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, e = next; } + /* Check the values */ while (!IS_LAST_ENTRY(entry)) { if (entry->e_value_block != 0) return -EFSCORRUPTED; - if (entry->e_value_size != 0 && - (value_start + le16_to_cpu(entry->e_value_offs) < - (void *)e + sizeof(__u32) || - value_start + le16_to_cpu(entry->e_value_offs) + - le32_to_cpu(entry->e_value_size) > end)) - return -EFSCORRUPTED; + if (entry->e_value_size != 0) { + u16 offs = le16_to_cpu(entry->e_value_offs); + u32 size = le32_to_cpu(entry->e_value_size); + void *value; + + /* + * The value cannot overlap the names, and the value + * with padding cannot extend beyond 'end'. Check both + * the padded and unpadded sizes, since the size may + * overflow to 0 when adding padding. + */ + if (offs > end - value_start) + return -EFSCORRUPTED; + value = value_start + offs; + if (value < (void *)e + sizeof(u32) || + size > end - value || + EXT4_XATTR_SIZE(size) > end - value) + return -EFSCORRUPTED; + } entry = EXT4_XATTR_NEXT(entry); } @@ -231,13 +246,12 @@ static int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { - struct ext4_xattr_entry *entry = IFIRST(header); int error = -EFSCORRUPTED; - if (((void *) header >= end) || + if (end - (void *)header < sizeof(*header) + sizeof(u32) || (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; - error = ext4_xattr_check_names(entry, end, entry); + error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); errout: if (error) __ext4_error_inode(inode, function, line, 0, @@ -1109,7 +1123,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, return 0; } -static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, +static int ext4_xattr_ibody_set(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { @@ -1216,7 +1230,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, } if (!value) { if (!is.s.not_found) - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { @@ -1227,7 +1241,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); @@ -1242,14 +1256,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, goto cleanup; if (!is.s.not_found) { i.value = NULL; - error = ext4_xattr_ibody_set(handle, inode, &i, - &is); + error = ext4_xattr_ibody_set(inode, &i, &is); } } } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); if (!value) ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -1384,7 +1397,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, goto out; /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); + error = ext4_xattr_ibody_set(inode, &i, is); if (error) goto out; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7c344b3ad70f..9ac262564fa6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1246,7 +1246,9 @@ int do_write_data_page(struct f2fs_io_info *fio) fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - gfp_flags); + PAGE_SIZE, 0, + fio->page->index, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); if (err == -ENOMEM) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 23c86e8cf523..2da8c3aa0ce5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2520,8 +2520,8 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page #define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page #define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy #define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context #define fscrypt_inherit_context fscrypt_notsupp_inherit_context #define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 383b5c29f46b..49f10dce817d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1762,31 +1762,16 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(filp, &policy); + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; - struct inode *inode = file_inode(filp); - int err; - - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - - if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) diff --git a/fs/iomap.c b/fs/iomap.c index a8ee8c33ca78..13dd413b2b9c 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -467,8 +467,9 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, offset = page_offset(page); while (length > 0) { - ret = iomap_apply(inode, offset, length, IOMAP_WRITE, - ops, page, iomap_page_mkwrite_actor); + ret = iomap_apply(inode, offset, length, + IOMAP_WRITE | IOMAP_FAULT, ops, page, + iomap_page_mkwrite_actor); if (unlikely(ret <= 0)) goto out_unlock; offset += ret; diff --git a/fs/mbcache.c b/fs/mbcache.c index c5bd19ffa326..b19be429d655 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -29,7 +29,7 @@ struct mb_cache { /* log2 of hash table size */ int c_bucket_bits; /* Maximum entries in cache to avoid degrading hash too much */ - int c_max_entries; + unsigned long c_max_entries; /* Protects c_list, c_entry_count */ spinlock_t c_list_lock; struct list_head c_list; @@ -43,7 +43,7 @@ struct mb_cache { static struct kmem_cache *mb_entry_cache; static unsigned long mb_cache_shrink(struct mb_cache *cache, - unsigned int nr_to_scan); + unsigned long nr_to_scan); static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, u32 key) @@ -155,12 +155,12 @@ out: } /* - * mb_cache_entry_find_first - find the first entry in cache with given key + * mb_cache_entry_find_first - find the first reusable entry with the given key * @cache: cache where we should search * @key: key to look for * - * Search in @cache for entry with key @key. Grabs reference to the first - * entry found and returns the entry. + * Search in @cache for a reusable entry with key @key. Grabs reference to the + * first reusable entry found and returns the entry. */ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key) @@ -170,14 +170,14 @@ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, EXPORT_SYMBOL(mb_cache_entry_find_first); /* - * mb_cache_entry_find_next - find next entry in cache with the same + * mb_cache_entry_find_next - find next reusable entry with the same key * @cache: cache where we should search * @entry: entry to start search from * - * Finds next entry in the hash chain which has the same key as @entry. - * If @entry is unhashed (which can happen when deletion of entry races - * with the search), finds the first entry in the hash chain. The function - * drops reference to @entry and returns with a reference to the found entry. + * Finds next reusable entry in the hash chain which has the same key as @entry. + * If @entry is unhashed (which can happen when deletion of entry races with the + * search), finds the first reusable entry in the hash chain. The function drops + * reference to @entry and returns with a reference to the found entry. */ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *entry) @@ -274,11 +274,11 @@ static unsigned long mb_cache_count(struct shrinker *shrink, /* Shrink number of entries in cache */ static unsigned long mb_cache_shrink(struct mb_cache *cache, - unsigned int nr_to_scan) + unsigned long nr_to_scan) { struct mb_cache_entry *entry; struct hlist_bl_head *head; - unsigned int shrunk = 0; + unsigned long shrunk = 0; spin_lock(&cache->c_list_lock); while (nr_to_scan-- && !list_empty(&cache->c_list)) { @@ -286,7 +286,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, struct mb_cache_entry, e_list); if (entry->e_referenced) { entry->e_referenced = 0; - list_move_tail(&cache->c_list, &entry->e_list); + list_move_tail(&entry->e_list, &cache->c_list); continue; } list_del_init(&entry->e_list); @@ -316,10 +316,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - int nr_to_scan = sc->nr_to_scan; struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); - return mb_cache_shrink(cache, nr_to_scan); + return mb_cache_shrink(cache, sc->nr_to_scan); } /* We shrink 1/X of the cache when we have too many entries in it */ @@ -341,11 +340,8 @@ static void mb_cache_shrink_worker(struct work_struct *work) struct mb_cache *mb_cache_create(int bucket_bits) { struct mb_cache *cache; - int bucket_count = 1 << bucket_bits; - int i; - - if (!try_module_get(THIS_MODULE)) - return NULL; + unsigned long bucket_count = 1UL << bucket_bits; + unsigned long i; cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) @@ -377,7 +373,6 @@ struct mb_cache *mb_cache_create(int bucket_bits) return cache; err_out: - module_put(THIS_MODULE); return NULL; } EXPORT_SYMBOL(mb_cache_create); @@ -411,7 +406,6 @@ void mb_cache_destroy(struct mb_cache *cache) } kfree(cache->c_hash); kfree(cache); - module_put(THIS_MODULE); } EXPORT_SYMBOL(mb_cache_destroy); @@ -420,7 +414,8 @@ static int __init mbcache_init(void) mb_entry_cache = kmem_cache_create("mbcache", sizeof(struct mb_cache_entry), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - BUG_ON(!mb_entry_cache); + if (!mb_entry_cache) + return -ENOMEM; return 0; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6be5204a06d3..38755ca96c7a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1297,8 +1297,7 @@ __xfs_get_blocks( sector_t iblock, struct buffer_head *bh_result, int create, - bool direct, - bool dax_fault) + bool direct) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1419,13 +1418,8 @@ __xfs_get_blocks( if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); /* direct IO needs special help */ - if (create) { - if (dax_fault) - ASSERT(!ISUNWRITTEN(&imap)); - else - xfs_map_direct(inode, bh_result, &imap, offset, - is_cow); - } + if (create) + xfs_map_direct(inode, bh_result, &imap, offset, is_cow); } /* @@ -1465,7 +1459,7 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); + return __xfs_get_blocks(inode, iblock, bh_result, create, false); } int @@ -1475,17 +1469,7 @@ xfs_get_blocks_direct( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); -} - -int -xfs_get_blocks_dax_fault( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); + return __xfs_get_blocks(inode, iblock, bh_result, create, true); } /* diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index b3c6634f9518..34dc00dfb91d 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -59,9 +59,6 @@ int xfs_get_blocks(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); int xfs_get_blocks_direct(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); -int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); - int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, ssize_t size, void *private); int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 6e4f7f900fea..d818c160451f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -318,7 +318,7 @@ xfs_file_dax_read( return 0; /* skip atime */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops); + ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); @@ -653,7 +653,7 @@ xfs_file_dax_write( trace_xfs_file_dax_write(ip, count, pos); - ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops); + ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); @@ -1474,7 +1474,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); @@ -1508,7 +1508,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1545,7 +1545,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); + ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) diff --git a/include/linux/dax.h b/include/linux/dax.h index add6c4bc568f..0afade8bd3d7 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -8,21 +8,41 @@ struct iomap_ops; -/* We use lowest available exceptional entry bit for locking */ +/* + * We use lowest available bit in exceptional entry for locking, one bit for + * the entry size (PMD) and two more to tell us if the entry is a huge zero + * page (HZP) or an empty entry that is just used for locking. In total four + * special bits. + * + * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and + * EMPTY bits aren't set the entry is a normal DAX entry with a filesystem + * block allocation. + */ +#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) -ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, +static inline unsigned long dax_radix_sector(void *entry) +{ + return (unsigned long)entry >> RADIX_DAX_SHIFT; +} + +static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) +{ + return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | + ((unsigned long)sector << RADIX_DAX_SHIFT) | + RADIX_DAX_ENTRY_LOCK); +} + +ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops); -ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, - get_block_t, dio_iodone_t, int flags); -int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); -int dax_truncate_page(struct inode *, loff_t from, get_block_t); -int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, +int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, struct iomap_ops *ops); -int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, bool wake_all); + pgoff_t index, void *entry, bool wake_all); #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); @@ -48,18 +68,28 @@ static inline int __dax_zero_page_range(struct block_device *bdev, } #endif -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) -int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, - unsigned int flags, get_block_t); +#ifdef CONFIG_FS_DAX_PMD +static inline unsigned int dax_radix_order(void *entry) +{ + if ((unsigned long)entry & RADIX_DAX_PMD) + return PMD_SHIFT - PAGE_SHIFT; + return 0; +} +int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, struct iomap_ops *ops); #else -static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags, get_block_t gb) +static inline unsigned int dax_radix_order(void *entry) +{ + return 0; +} +static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, unsigned int flags, + struct iomap_ops *ops) { return VM_FAULT_FALLBACK; } #endif int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); -#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) static inline bool vma_is_dax(struct vm_area_struct *vma) { diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index ff8b11b26f31..c074b670aa99 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -18,73 +18,9 @@ #include <crypto/skcipher.h> #include <uapi/linux/fs.h> -#define FS_KEY_DERIVATION_NONCE_SIZE 16 -#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 - -#define FS_POLICY_FLAGS_PAD_4 0x00 -#define FS_POLICY_FLAGS_PAD_8 0x01 -#define FS_POLICY_FLAGS_PAD_16 0x02 -#define FS_POLICY_FLAGS_PAD_32 0x03 -#define FS_POLICY_FLAGS_PAD_MASK 0x03 -#define FS_POLICY_FLAGS_VALID 0x03 - -/* Encryption algorithms */ -#define FS_ENCRYPTION_MODE_INVALID 0 -#define FS_ENCRYPTION_MODE_AES_256_XTS 1 -#define FS_ENCRYPTION_MODE_AES_256_GCM 2 -#define FS_ENCRYPTION_MODE_AES_256_CBC 3 -#define FS_ENCRYPTION_MODE_AES_256_CTS 4 - -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct fscrypt_context { - u8 format; - u8 contents_encryption_mode; - u8 filenames_encryption_mode; - u8 flags; - u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; - u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; -} __packed; - -/* Encryption parameters */ -#define FS_XTS_TWEAK_SIZE 16 -#define FS_AES_128_ECB_KEY_SIZE 16 -#define FS_AES_256_GCM_KEY_SIZE 32 -#define FS_AES_256_CBC_KEY_SIZE 32 -#define FS_AES_256_CTS_KEY_SIZE 32 -#define FS_AES_256_XTS_KEY_SIZE 64 -#define FS_MAX_KEY_SIZE 64 - -#define FS_KEY_DESC_PREFIX "fscrypt:" -#define FS_KEY_DESC_PREFIX_SIZE 8 - -/* This is passed in from userspace into the kernel keyring */ -struct fscrypt_key { - u32 mode; - u8 raw[FS_MAX_KEY_SIZE]; - u32 size; -} __packed; - -struct fscrypt_info { - u8 ci_data_mode; - u8 ci_filename_mode; - u8 ci_flags; - struct crypto_skcipher *ci_ctfm; - struct key *ci_keyring_key; - u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; -}; +#define FS_CRYPTO_BLOCK_SIZE 16 -#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define FS_WRITE_PATH_FL 0x00000002 +struct fscrypt_info; struct fscrypt_ctx { union { @@ -102,19 +38,6 @@ struct fscrypt_ctx { u8 mode; /* Encryption mode for tfm */ }; -struct fscrypt_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_FS_COMPLETION_RESULT(ecr) \ - struct fscrypt_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } - -#define FS_FNAME_NUM_SCATTER_ENTRIES 4 -#define FS_CRYPTO_BLOCK_SIZE 16 -#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 - /** * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. @@ -154,9 +77,15 @@ struct fscrypt_name { #define fname_len(p) ((p)->disk_name.len) /* + * fscrypt superblock flags + */ +#define FS_CFLG_OWN_PAGES (1U << 1) + +/* * crypto opertions for filesystems */ struct fscrypt_operations { + unsigned int flags; int (*get_context)(struct inode *, void *, size_t); int (*key_prefix)(struct inode *, u8 **); int (*prepare_context)(struct inode *); @@ -206,7 +135,7 @@ static inline struct page *fscrypt_control_page(struct page *page) #endif } -static inline int fscrypt_has_encryption_key(struct inode *inode) +static inline int fscrypt_has_encryption_key(const struct inode *inode) { #if IS_ENABLED(CONFIG_FS_ENCRYPTION) return (inode->i_crypt_info != NULL); @@ -238,25 +167,25 @@ static inline void fscrypt_set_d_op(struct dentry *dentry) #if IS_ENABLED(CONFIG_FS_ENCRYPTION) /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; -int fscrypt_initialize(void); - -extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t); +extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); extern void fscrypt_release_ctx(struct fscrypt_ctx *); -extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t); -extern int fscrypt_decrypt_page(struct page *); +extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *, + unsigned int, unsigned int, + u64, gfp_t); +extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int, + unsigned int, u64); extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); extern void fscrypt_pullback_bio_page(struct page **, bool); extern void fscrypt_restore_control_page(struct page *); -extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, +extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, unsigned int); /* policy.c */ -extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); -extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); +extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); +extern int fscrypt_ioctl_get_policy(struct file *, void __user *); extern int fscrypt_has_permitted_context(struct inode *, struct inode *); extern int fscrypt_inherit_context(struct inode *, struct inode *, void *, bool); /* keyinfo.c */ -extern int get_crypt_info(struct inode *); extern int fscrypt_get_encryption_info(struct inode *); extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); @@ -264,8 +193,8 @@ extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); extern int fscrypt_setup_filename(struct inode *, const struct qstr *, int lookup, struct fscrypt_name *); extern void fscrypt_free_filename(struct fscrypt_name *); -extern u32 fscrypt_fname_encrypted_size(struct inode *, u32); -extern int fscrypt_fname_alloc_buffer(struct inode *, u32, +extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32); +extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, struct fscrypt_str *); extern void fscrypt_fname_free_buffer(struct fscrypt_str *); extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, @@ -275,7 +204,7 @@ extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, #endif /* crypto.c */ -static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i, +static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(const struct inode *i, gfp_t f) { return ERR_PTR(-EOPNOTSUPP); @@ -286,13 +215,18 @@ static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c) return; } -static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i, - struct page *p, gfp_t f) +static inline struct page *fscrypt_notsupp_encrypt_page(const struct inode *i, + struct page *p, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t f) { return ERR_PTR(-EOPNOTSUPP); } -static inline int fscrypt_notsupp_decrypt_page(struct page *p) +static inline int fscrypt_notsupp_decrypt_page(const struct inode *i, struct page *p, + unsigned int len, unsigned int offs, + u64 lblk_num) { return -EOPNOTSUPP; } @@ -313,21 +247,21 @@ static inline void fscrypt_notsupp_restore_control_page(struct page *p) return; } -static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, +static inline int fscrypt_notsupp_zeroout_range(const struct inode *i, pgoff_t p, sector_t s, unsigned int f) { return -EOPNOTSUPP; } /* policy.c */ -static inline int fscrypt_notsupp_process_policy(struct file *f, - const struct fscrypt_policy *p) +static inline int fscrypt_notsupp_ioctl_set_policy(struct file *f, + const void __user *arg) { return -EOPNOTSUPP; } -static inline int fscrypt_notsupp_get_policy(struct inode *i, - struct fscrypt_policy *p) +static inline int fscrypt_notsupp_ioctl_get_policy(struct file *f, + void __user *arg) { return -EOPNOTSUPP; } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 7892f55a1866..f185156de74d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -49,6 +49,7 @@ struct iomap { #define IOMAP_WRITE (1 << 0) /* writing, must allocate blocks */ #define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ +#define IOMAP_FAULT (1 << 3) /* mapping for page fault */ struct iomap_ops { /* diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index c1d11df07b28..36da93fbf188 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -258,6 +258,20 @@ struct fsxattr { /* Policy provided via an ioctl on the topmost directory */ #define FS_KEY_DESCRIPTOR_SIZE 8 +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 + struct fscrypt_policy { __u8 version; __u8 contents_encryption_mode; diff --git a/mm/filemap.c b/mm/filemap.c index 5b4dd03130da..69568388c699 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -135,10 +135,9 @@ static int page_cache_tree_insert(struct address_space *mapping, } else { /* DAX can replace empty locked entry with a hole */ WARN_ON_ONCE(p != - (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | - RADIX_DAX_ENTRY_LOCK)); + dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); /* Wakeup waiters for exceptional entry lock */ - dax_wake_mapping_entry_waiter(mapping, page->index, + dax_wake_mapping_entry_waiter(mapping, page->index, p, false); } } |