diff options
Diffstat (limited to 'fs')
284 files changed, 8966 insertions, 6969 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index f08fbbfafd9a..d1ad3935fb85 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -166,7 +166,7 @@ config TMPFS space. If you unmount a tmpfs instance, everything stored therein is lost. - See <file:Documentation/filesystems/tmpfs.txt> for details. + See <file:Documentation/filesystems/tmpfs.rst> for details. config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 62dc4f577ba1..04f86b8c100e 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -36,6 +36,12 @@ config COMPAT_BINFMT_ELF config ARCH_BINFMT_ELF_STATE bool +config ARCH_HAVE_ELF_PROT + bool + +config ARCH_USE_GNU_PROPERTY + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF @@ -72,7 +78,7 @@ config CORE_DUMP_DEFAULT_ELF_HEADERS The core dump behavior can be controlled per process using the /proc/PID/coredump_filter pseudo-file; this setting is - inherited. See Documentation/filesystems/proc.txt for details. + inherited. See Documentation/filesystems/proc.rst for details. This config option changes the default setting of coredump_filter seen at boot time. If unsure, say Y. diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig index df4650dccf68..44738fed6625 100644 --- a/fs/adfs/Kconfig +++ b/fs/adfs/Kconfig @@ -12,7 +12,7 @@ config ADFS_FS The ADFS partition should be the first partition (i.e., /dev/[hs]d?1) on each of your drives. Please read the file - <file:Documentation/filesystems/adfs.txt> for further details. + <file:Documentation/filesystems/adfs.rst> for further details. To compile this code as a module, choose M here: the module will be called adfs. diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig index 84c46b9025c5..eb9d0ab850cb 100644 --- a/fs/affs/Kconfig +++ b/fs/affs/Kconfig @@ -9,7 +9,7 @@ config AFFS_FS FFS partition on your hard drive. Amiga floppies however cannot be read with this driver due to an incompatibility of the floppy controller used in an Amiga and the standard floppy controller in - PCs and workstations. Read <file:Documentation/filesystems/affs.txt> + PCs and workstations. Read <file:Documentation/filesystems/affs.rst> and <file:fs/affs/Changes>. With this driver you can also mount disk files used by Bernd diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 3fb1f559e317..1ad211d72b3b 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -8,7 +8,7 @@ config AFS_FS If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. - See <file:Documentation/filesystems/afs.txt> for more information. + See <file:Documentation/filesystems/afs.rst> for more information. If unsure, say N. @@ -18,7 +18,7 @@ config AFS_DEBUG help Say Y here to make runtime controllable debugging messages appear. - See <file:Documentation/filesystems/afs.txt> for more information. + See <file:Documentation/filesystems/afs.rst> for more information. If unsure, say N. @@ -37,6 +37,6 @@ config AFS_DEBUG_CURSOR the dmesg log if the server rotation algorithm fails to successfully contact a server. - See <file:Documentation/filesystems/afs.txt> for more information. + See <file:Documentation/filesystems/afs.rst> for more information. If unsure, say N. @@ -176,6 +176,7 @@ struct fsync_iocb { struct file *file; struct work_struct work; bool datasync; + struct cred *creds; }; struct poll_iocb { @@ -1589,8 +1590,11 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, static void aio_fsync_work(struct work_struct *work) { struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work); + const struct cred *old_cred = override_creds(iocb->fsync.creds); iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync); + revert_creds(old_cred); + put_cred(iocb->fsync.creds); iocb_put(iocb); } @@ -1604,6 +1608,10 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb, if (unlikely(!req->file->f_op->fsync)) return -EINVAL; + req->creds = prepare_creds(); + if (!req->creds) + return -ENOMEM; + req->datasync = datasync; INIT_WORK(&req->work, aio_fsync_work); schedule_work(&req->work); diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig index 3e1247f07913..3a757805b585 100644 --- a/fs/bfs/Kconfig +++ b/fs/bfs/Kconfig @@ -11,7 +11,7 @@ config BFS_FS on your /stand slice from within Linux. You then also need to say Y to "UnixWare slices support", below. More information about the BFS file system is contained in the file - <file:Documentation/filesystems/bfs.txt>. + <file:Documentation/filesystems/bfs.rst>. If you don't know what this is about, say N. diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 13f25e241ac4..8945671fe0e5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -40,12 +40,18 @@ #include <linux/sched/coredump.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/sizes.h> +#include <linux/types.h> #include <linux/cred.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <asm/param.h> #include <asm/page.h> +#ifndef ELF_COMPAT +#define ELF_COMPAT 0 +#endif + #ifndef user_long_t #define user_long_t long #endif @@ -539,7 +545,8 @@ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, #endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */ -static inline int make_prot(u32 p_flags) +static inline int make_prot(u32 p_flags, struct arch_elf_state *arch_state, + bool has_interp, bool is_interp) { int prot = 0; @@ -549,7 +556,8 @@ static inline int make_prot(u32 p_flags) prot |= PROT_WRITE; if (p_flags & PF_X) prot |= PROT_EXEC; - return prot; + + return arch_elf_adjust_prot(prot, arch_state, has_interp, is_interp); } /* This is much more generalized than the library routine read function, @@ -559,7 +567,8 @@ static inline int make_prot(u32 p_flags) static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct file *interpreter, - unsigned long no_base, struct elf_phdr *interp_elf_phdata) + unsigned long no_base, struct elf_phdr *interp_elf_phdata, + struct arch_elf_state *arch_state) { struct elf_phdr *eppnt; unsigned long load_addr = 0; @@ -591,7 +600,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; - int elf_prot = make_prot(eppnt->p_flags); + int elf_prot = make_prot(eppnt->p_flags, arch_state, + true, true); unsigned long vaddr = 0; unsigned long k, map_addr; @@ -682,6 +692,111 @@ out: * libraries. There is no binary dependent code anywhere else. */ +static int parse_elf_property(const char *data, size_t *off, size_t datasz, + struct arch_elf_state *arch, + bool have_prev_type, u32 *prev_type) +{ + size_t o, step; + const struct gnu_property *pr; + int ret; + + if (*off == datasz) + return -ENOENT; + + if (WARN_ON_ONCE(*off > datasz || *off % ELF_GNU_PROPERTY_ALIGN)) + return -EIO; + o = *off; + datasz -= *off; + + if (datasz < sizeof(*pr)) + return -ENOEXEC; + pr = (const struct gnu_property *)(data + o); + o += sizeof(*pr); + datasz -= sizeof(*pr); + + if (pr->pr_datasz > datasz) + return -ENOEXEC; + + WARN_ON_ONCE(o % ELF_GNU_PROPERTY_ALIGN); + step = round_up(pr->pr_datasz, ELF_GNU_PROPERTY_ALIGN); + if (step > datasz) + return -ENOEXEC; + + /* Properties are supposed to be unique and sorted on pr_type: */ + if (have_prev_type && pr->pr_type <= *prev_type) + return -ENOEXEC; + *prev_type = pr->pr_type; + + ret = arch_parse_elf_property(pr->pr_type, data + o, + pr->pr_datasz, ELF_COMPAT, arch); + if (ret) + return ret; + + *off = o + step; + return 0; +} + +#define NOTE_DATA_SZ SZ_1K +#define GNU_PROPERTY_TYPE_0_NAME "GNU" +#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME)) + +static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, + struct arch_elf_state *arch) +{ + union { + struct elf_note nhdr; + char data[NOTE_DATA_SZ]; + } note; + loff_t pos; + ssize_t n; + size_t off, datasz; + int ret; + bool have_prev_type; + u32 prev_type; + + if (!IS_ENABLED(CONFIG_ARCH_USE_GNU_PROPERTY) || !phdr) + return 0; + + /* load_elf_binary() shouldn't call us unless this is true... */ + if (WARN_ON_ONCE(phdr->p_type != PT_GNU_PROPERTY)) + return -ENOEXEC; + + /* If the properties are crazy large, that's too bad (for now): */ + if (phdr->p_filesz > sizeof(note)) + return -ENOEXEC; + + pos = phdr->p_offset; + n = kernel_read(f, ¬e, phdr->p_filesz, &pos); + + BUILD_BUG_ON(sizeof(note) < sizeof(note.nhdr) + NOTE_NAME_SZ); + if (n < 0 || n < sizeof(note.nhdr) + NOTE_NAME_SZ) + return -EIO; + + if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || + note.nhdr.n_namesz != NOTE_NAME_SZ || + strncmp(note.data + sizeof(note.nhdr), + GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr))) + return -ENOEXEC; + + off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, + ELF_GNU_PROPERTY_ALIGN); + if (off > n) + return -ENOEXEC; + + if (note.nhdr.n_descsz > n - off) + return -ENOEXEC; + datasz = off + note.nhdr.n_descsz; + + have_prev_type = false; + do { + ret = parse_elf_property(note.data, &off, datasz, arch, + have_prev_type, &prev_type); + have_prev_type = true; + } while (!ret); + + return ret == -ENOENT ? 0 : ret; +} + static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ @@ -689,6 +804,7 @@ static int load_elf_binary(struct linux_binprm *bprm) int load_addr_set = 0; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; + struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_bss, elf_brk; int bss_prot = 0; int retval, i; @@ -726,6 +842,11 @@ static int load_elf_binary(struct linux_binprm *bprm) for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { char *elf_interpreter; + if (elf_ppnt->p_type == PT_GNU_PROPERTY) { + elf_property_phdata = elf_ppnt; + continue; + } + if (elf_ppnt->p_type != PT_INTERP) continue; @@ -819,9 +940,14 @@ out_free_interp: goto out_free_dentry; /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ + elf_property_phdata = NULL; elf_ppnt = interp_elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { + case PT_GNU_PROPERTY: + elf_property_phdata = elf_ppnt; + break; + case PT_LOPROC ... PT_HIPROC: retval = arch_elf_pt_proc(interp_elf_ex, elf_ppnt, interpreter, @@ -832,6 +958,11 @@ out_free_interp: } } + retval = parse_elf_properties(interpreter ?: bprm->file, + elf_property_phdata, &arch_state); + if (retval) + goto out_free_dentry; + /* * Allow arch code to reject the ELF at this point, whilst it's * still possible to return an error to the code that invoked @@ -913,7 +1044,8 @@ out_free_interp: } } - elf_prot = make_prot(elf_ppnt->p_flags); + elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, + !!interpreter, false); elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE; @@ -1056,7 +1188,8 @@ out_free_interp: if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, interpreter, - load_bias, interp_elf_phdata); + load_bias, interp_elf_phdata, + &arch_state); if (!IS_ERR((void *)elf_entry)) { /* * load_elf_interp() returns relocation @@ -1355,7 +1488,6 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { u32 __user *header = (u32 __user *) vma->vm_start; u32 word; - mm_segment_t fs = get_fs(); /* * Doing it this way gets the constant folded by GCC. */ @@ -1368,14 +1500,8 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, magic.elfmag[EI_MAG1] = ELFMAG1; magic.elfmag[EI_MAG2] = ELFMAG2; magic.elfmag[EI_MAG3] = ELFMAG3; - /* - * Switch to the user "segment" for get_user(), - * then put back what elf_core_dump() had in place. - */ - set_fs(USER_DS); if (unlikely(get_user(word, header))) word = 0; - set_fs(fs); if (word == magic.cmp) return PAGE_SIZE; } @@ -1556,10 +1682,7 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - copy_siginfo_to_user((user_siginfo_t __user *) csigdata, siginfo); - set_fs(old_fs); + copy_siginfo_to_external(csigdata, siginfo); fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); } @@ -1733,7 +1856,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, (!regset->active || regset->active(t->task, regset) > 0)) { int ret; size_t size = regset_size(t->task, regset); - void *data = kmalloc(size, GFP_KERNEL); + void *data = kzalloc(size, GFP_KERNEL); if (unlikely(!data)) return 0; ret = regset->get(t->task, regset, @@ -2186,7 +2309,6 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - mm_segment_t fs; int segs, i; size_t vma_data_size = 0; struct vm_area_struct *vma, *gate_vma; @@ -2235,13 +2357,10 @@ static int elf_core_dump(struct coredump_params *cprm) * notes. This also sets up the file header. */ if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) - goto cleanup; + goto end_coredump; has_dumped = 1; - fs = get_fs(); - set_fs(KERNEL_DS); - offset += sizeof(elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ @@ -2369,9 +2488,6 @@ static int elf_core_dump(struct coredump_params *cprm) } end_coredump: - set_fs(fs); - -cleanup: free_note_info(&info); kfree(shdr4extnum); kvfree(vma_filesz); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 240f66663543..d9501a86cec9 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1549,7 +1549,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) { #define NUM_NOTES 6 int has_dumped = 0; - mm_segment_t fs; int segs; int i; struct vm_area_struct *vma; @@ -1589,31 +1588,31 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); if (!elf) - goto cleanup; + goto end_coredump; prstatus = kzalloc(sizeof(*prstatus), GFP_KERNEL); if (!prstatus) - goto cleanup; + goto end_coredump; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) - goto cleanup; + goto end_coredump; notes = kmalloc_array(NUM_NOTES, sizeof(struct memelfnote), GFP_KERNEL); if (!notes) - goto cleanup; + goto end_coredump; fpu = kmalloc(sizeof(*fpu), GFP_KERNEL); if (!fpu) - goto cleanup; + goto end_coredump; #ifdef ELF_CORE_COPY_XFPREGS xfpu = kmalloc(sizeof(*xfpu), GFP_KERNEL); if (!xfpu) - goto cleanup; + goto end_coredump; #endif for (ct = current->mm->core_state->dumper.next; ct; ct = ct->next) { tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); if (!tmp) - goto cleanup; + goto end_coredump; tmp->thread = ct->task; list_add(&tmp->list, &thread_list); @@ -1678,9 +1677,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu); #endif - fs = get_fs(); - set_fs(KERNEL_DS); - offset += sizeof(*elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ @@ -1788,9 +1784,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) } end_coredump: - set_fs(fs); - -cleanup: while (!list_empty(&thread_list)) { struct list_head *tmp = thread_list.next; list_del(tmp); diff --git a/fs/block_dev.c b/fs/block_dev.c index 93672c3f1c78..47860e589388 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -255,7 +255,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, break; if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc, true)) - io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -449,7 +449,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc, true)) - io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -614,10 +614,9 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } -static int blkdev_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void blkdev_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); + mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(struct file *file, struct address_space *mapping, @@ -672,7 +671,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ - error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL); + error = blkdev_issue_flush(bdev, GFP_KERNEL); if (error == -EOPNOTSUPP) error = 0; @@ -713,7 +712,6 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, blk_queue_exit(bdev->bd_queue); return result; } -EXPORT_SYMBOL_GPL(bdev_read_page); /** * bdev_write_page() - Start writing a page to a block device @@ -758,7 +756,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, blk_queue_exit(bdev->bd_queue); return result; } -EXPORT_SYMBOL_GPL(bdev_write_page); /* * pseudo-fs @@ -882,21 +879,6 @@ static int bdev_set(struct inode *inode, void *data) static LIST_HEAD(all_bdevs); -/* - * If there is a bdev inode for this device, unhash it so that it gets evicted - * as soon as last inode reference is dropped. - */ -void bdev_unhash_inode(dev_t dev) -{ - struct inode *inode; - - inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev); - if (inode) { - remove_inode_hash(inode); - iput(inode); - } -} - struct block_device *bdget(dev_t dev) { struct block_device *bdev; @@ -1516,7 +1498,7 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) lockdep_assert_held(&bdev->bd_mutex); rescan: - ret = blk_drop_partitions(disk, bdev); + ret = blk_drop_partitions(bdev); if (ret) return ret; @@ -2023,8 +2005,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if (bdev_read_only(I_BDEV(bd_inode))) return -EPERM; - /* uswsusp needs write permission to the swap */ - if (IS_SWAPFILE(bd_inode) && !hibernation_available()) + if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode)) return -ETXTBSY; if (!iov_iter_count(from)) @@ -2085,7 +2066,7 @@ static int blkdev_writepages(struct address_space *mapping, static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, - .readpages = blkdev_readpages, + .readahead = blkdev_readahead, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, @@ -2183,18 +2164,6 @@ const struct file_operations def_blk_fops = { .fallocate = blkdev_fallocate, }; -int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) -{ - int res; - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - res = blkdev_ioctl(bdev, 0, cmd, arg); - set_fs(old_fs); - return res; -} - -EXPORT_SYMBOL(ioctl_by_bdev); - /** * lookup_bdev - lookup a struct block_device by name * @pathname: special file representing the block device diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f8ec2d8606fd..7c6f0bbb54a5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -976,9 +976,7 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, "page private not zero on page %llu", (unsigned long long)page_offset(page)); - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); + detach_page_private(page); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c59e07360083..68c96057ad2d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3099,22 +3099,16 @@ static int submit_extent_page(unsigned int opf, static void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long)eb); - } else { + if (!PagePrivate(page)) + attach_page_private(page, eb); + else WARN_ON(page->private != (unsigned long)eb); - } } void set_page_extent_mapped(struct page *page) { - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, EXTENT_PAGE_PRIVATE); - } + if (!PagePrivate(page)) + attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); } static struct extent_map * @@ -4390,51 +4384,32 @@ int extent_writepages(struct address_space *mapping, return ret; } -int extent_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages) +void extent_readahead(struct readahead_control *rac) { struct bio *bio = NULL; unsigned long bio_flags = 0; struct page *pagepool[16]; struct extent_map *em_cached = NULL; - int nr = 0; u64 prev_em_start = (u64)-1; + int nr; - while (!list_empty(pages)) { - u64 contig_end = 0; - - for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) { - struct page *page = lru_to_page(pages); - - prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - readahead_gfp_mask(mapping))) { - put_page(page); - break; - } - - pagepool[nr++] = page; - contig_end = page_offset(page) + PAGE_SIZE - 1; - } + while ((nr = readahead_page_batch(rac, pagepool))) { + u64 contig_start = page_offset(pagepool[0]); + u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1; - if (nr) { - u64 contig_start = page_offset(pagepool[0]); + ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); - - contiguous_readpages(pagepool, nr, contig_start, - contig_end, &em_cached, &bio, &bio_flags, - &prev_em_start); - } + contiguous_readpages(pagepool, nr, contig_start, contig_end, + &em_cached, &bio, &bio_flags, &prev_em_start); } if (em_cached) free_extent_map(em_cached); - if (bio) - return submit_one_bio(bio, 0, bio_flags); - return 0; + if (bio) { + if (submit_one_bio(bio, 0, bio_flags)) + return; + } } /* @@ -4952,10 +4927,7 @@ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) * We need to make sure we haven't be attached * to a new eb. */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - put_page(page); + detach_page_private(page); } if (mapped) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 9a10681b12bf..602bf3af9fb4 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -201,8 +201,7 @@ int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); -int extent_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages); +void extent_readahead(struct readahead_control *rac); int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1242d0aa108d..768c8be4c765 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4904,8 +4904,8 @@ static void evict_inode_truncate_pages(struct inode *inode) /* * Keep looping until we have no more ranges in the io tree. - * We can have ongoing bios started by readpages (called from readahead) - * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * We can have ongoing bios started by readahead that have + * their endio callback (extent_io.c:end_bio_extent_readpage) * still in progress (unlocked the pages in the bio but did not yet * unlocked the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before @@ -7100,11 +7100,11 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * for it to complete) and then invalidate the pages for * this range (through invalidate_inode_pages2_range()), * but that can lead us to a deadlock with a concurrent - * call to readpages() (a buffered read or a defrag call + * call to readahead (a buffered read or a defrag call * triggered a readahead) on a page lock due to an * ordered dio extent we created before but did not have * yet a corresponding bio submitted (whence it can not - * complete), which makes readpages() wait for that + * complete), which makes readahead wait for that * ordered extent to complete while holding a lock on * that page. */ @@ -7878,21 +7878,16 @@ static int btrfs_writepages(struct address_space *mapping, return extent_writepages(mapping, wbc); } -static int -btrfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void btrfs_readahead(struct readahead_control *rac) { - return extent_readpages(mapping, pages, nr_pages); + extent_readahead(rac); } static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); - if (ret == 1) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + if (ret == 1) + detach_page_private(page); return ret; } @@ -7914,14 +7909,8 @@ static int btrfs_migratepage(struct address_space *mapping, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (PagePrivate2(page)) { ClearPagePrivate2(page); @@ -8043,11 +8032,7 @@ again: } ClearPageChecked(page); - if (PagePrivate(page)) { - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); - } + detach_page_private(page); } /* @@ -10138,7 +10123,7 @@ static const struct address_space_operations btrfs_aops = { .readpage = btrfs_readpage, .writepage = btrfs_writepage, .writepages = btrfs_writepages, - .readpages = btrfs_readpages, + .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0f37660b14b2..d9813a5b075a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7126,13 +7126,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) goto out; } - if (!access_ok(arg->clone_sources, - sizeof(*arg->clone_sources) * - arg->clone_sources_count)) { - ret = -EFAULT; - goto out; - } - if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { ret = -EINVAL; goto out; diff --git a/fs/buffer.c b/fs/buffer.c index a60f60396cfa..64fe82ec65ff 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -123,14 +123,6 @@ void __wait_on_buffer(struct buffer_head * bh) } EXPORT_SYMBOL(__wait_on_buffer); -static void -__clear_page_buffers(struct page *page) -{ - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); -} - static void buffer_io_error(struct buffer_head *bh, char *msg) { if (!test_bit(BH_Quiet, &bh->b_state)) @@ -906,7 +898,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) @@ -1154,12 +1146,19 @@ EXPORT_SYMBOL(mark_buffer_dirty); void mark_buffer_write_io_error(struct buffer_head *bh) { + struct super_block *sb; + set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ if (bh->b_page && bh->b_page->mapping) mapping_set_error(bh->b_page->mapping, -EIO); if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); + rcu_read_lock(); + sb = READ_ONCE(bh->b_bdev->bd_super); + if (sb) + errseq_set(&sb->s_wb_err, -EIO); + rcu_read_unlock(); } EXPORT_SYMBOL(mark_buffer_write_io_error); @@ -1580,7 +1579,7 @@ void create_empty_buffers(struct page *page, bh = bh->b_this_page; } while (bh != head); } - attach_page_buffers(page, head); + attach_page_private(page, head); spin_unlock(&page->mapping->private_lock); } EXPORT_SYMBOL(create_empty_buffers); @@ -2567,7 +2566,7 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head) bh->b_this_page = head; bh = bh->b_this_page; } while (bh != head); - attach_page_buffers(page, head); + attach_page_private(page, head); spin_unlock(&page->mapping->private_lock); } @@ -3227,7 +3226,7 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = next; } while (bh != head); *buffers_to_free = head; - __clear_page_buffers(page); + detach_page_private(page); return 1; failed: return 0; diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index ae559ed5b3b3..ff9ca55a9ae9 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -8,7 +8,7 @@ config CACHEFILES filesystems - primarily networking filesystems - thus allowing fast local disk to enhance the speed of slower devices. - See Documentation/filesystems/caching/cachefiles.txt for more + See Documentation/filesystems/caching/cachefiles.rst for more information. config CACHEFILES_DEBUG @@ -36,5 +36,5 @@ config CACHEFILES_HISTOGRAM bouncing between CPUs. On the other hand, the histogram may be useful for debugging purposes. Saying 'N' here is recommended. - See Documentation/filesystems/caching/cachefiles.txt for more + See Documentation/filesystems/caching/cachefiles.rst for more information. diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5f3aa4d607de..f1acde6fb9a6 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3991,7 +3991,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, __ceph_queue_cap_release(session, cap); spin_unlock(&session->s_cap_lock); } - goto done; + goto flush_cap_releases; } /* these will work even if we don't have a cap yet */ diff --git a/fs/char_dev.c b/fs/char_dev.c index c5e6eff5a381..ba0ded7842a7 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -483,6 +483,9 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count) p->dev = dev; p->count = count; + if (WARN_ON(dev == WHITEOUT_DEV)) + return -EBUSY; + error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig index ae6759f9594a..c3477eeafb3f 100644 --- a/fs/coda/Kconfig +++ b/fs/coda/Kconfig @@ -15,7 +15,7 @@ config CODA_FS *client*. You will need user level code as well, both for the client and server. Servers are currently user level, i.e. they need no kernel support. Please read - <file:Documentation/filesystems/coda.txt> and check out the Coda + <file:Documentation/filesystems/coda.rst> and check out the Coda home page <http://www.coda.cs.cmu.edu/>. To compile the coda client support as a module, choose M here: the diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index aaad4ca1217e..e61f3fe8e32a 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -17,6 +17,8 @@ #include <linux/elfcore-compat.h> #include <linux/time.h> +#define ELF_COMPAT 1 + /* * Rename the basic ELF layout types to refer to the 32-bit class of files. */ @@ -28,18 +30,20 @@ #undef elf_shdr #undef elf_note #undef elf_addr_t +#undef ELF_GNU_PROPERTY_ALIGN #define elfhdr elf32_hdr #define elf_phdr elf32_phdr #define elf_shdr elf32_shdr #define elf_note elf32_note #define elf_addr_t Elf32_Addr +#define ELF_GNU_PROPERTY_ALIGN ELF32_GNU_PROPERTY_ALIGN /* * Some data types as stored in coredump. */ #define user_long_t compat_long_t #define user_siginfo_t compat_siginfo_t -#define copy_siginfo_to_user copy_siginfo_to_user32 +#define copy_siginfo_to_external copy_siginfo_to_external32 /* * The machine-dependent core note format types are defined in elfcore-compat.h, diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index fd0b5dd68f9e..8bd6a883c94c 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -9,7 +9,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see Documentation/filesystems/configfs/configfs.txt for more + * Please see Documentation/filesystems/configfs.rst for more * information. */ diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 6e0f1fcb8a5b..704a4356f137 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -9,7 +9,7 @@ * * configfs Copyright (C) 2005 Oracle. All rights reserved. * - * Please see the file Documentation/filesystems/configfs/configfs.txt for + * Please see the file Documentation/filesystems/configfs.rst for * critical information about using the config_item interface. */ diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index c8bebb70a971..d98cef0dbb6b 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -9,7 +9,7 @@ config CRAMFS limited to 256MB file systems (with 16MB files), and doesn't support 16/32 bits uid/gid, hard links and timestamps. - See <file:Documentation/filesystems/cramfs.txt> and + See <file:Documentation/filesystems/cramfs.rst> and <file:fs/cramfs/README> for further information. To compile this as a module, choose M here: the module will be called diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 1ecaac7ee3cb..ed015cb66c7c 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -54,6 +54,7 @@ struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags) /** * fscrypt_free_bounce_page() - free a ciphertext bounce page + * @bounce_page: the bounce page to free, or NULL * * Free a bounce page that was allocated by fscrypt_encrypt_pagecache_blocks(), * or by fscrypt_alloc_bounce_page() directly. @@ -76,8 +77,12 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, memset(iv, 0, ci->ci_mode->ivsize); if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { - WARN_ON_ONCE((u32)lblk_num != lblk_num); + WARN_ON_ONCE(lblk_num > U32_MAX); + WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX); lblk_num |= (u64)ci->ci_inode->i_ino << 32; + } else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { + WARN_ON_ONCE(lblk_num > U32_MAX); + lblk_num = (u32)(ci->ci_hashed_ino + lblk_num); } else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE); } @@ -132,7 +137,8 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, } /** - * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a pagecache page + * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a + * pagecache page * @page: The locked pagecache page containing the block(s) to encrypt * @len: Total size of the block(s) to encrypt. Must be a nonzero * multiple of the filesystem's block size. @@ -222,7 +228,8 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); /** - * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a pagecache page + * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a + * pagecache page * @page: The locked pagecache page containing the block(s) to decrypt * @len: Total size of the block(s) to decrypt. Must be a nonzero * multiple of the filesystem's block size. @@ -346,6 +353,8 @@ void fscrypt_msg(const struct inode *inode, const char *level, /** * fscrypt_init() - Set up for fs encryption. + * + * Return: 0 on success; -errno on failure */ static int __init fscrypt_init(void) { diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 4c212442a8f7..83ca5f1e7934 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -18,7 +18,7 @@ #include <crypto/skcipher.h> #include "fscrypt_private.h" -/** +/* * struct fscrypt_nokey_name - identifier for directory entry when key is absent * * When userspace lists an encrypted directory without access to the key, the @@ -83,13 +83,8 @@ static int fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result) tfm = prev_tfm; } } - { - SHASH_DESC_ON_STACK(desc, tfm); - desc->tfm = tfm; - - return crypto_shash_digest(desc, data, data_len, result); - } + return crypto_shash_tfm_digest(tfm, data, data_len, result); } static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) @@ -105,9 +100,12 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) /** * fscrypt_fname_encrypt() - encrypt a filename - * - * The output buffer must be at least as large as the input buffer. - * Any extra space is filled with NUL padding before encryption. + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @iname: the filename to encrypt + * @out: (output) the encrypted filename + * @olen: size of the encrypted filename. It must be at least @iname->len. + * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ @@ -157,8 +155,11 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, /** * fname_decrypt() - decrypt a filename - * - * The caller must have allocated sufficient memory for the @oname string. + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @iname: the encrypted filename to decrypt + * @oname: (output) the decrypted filename. The caller must have allocated + * enough space for this, e.g. using fscrypt_fname_alloc_buffer(). * * Return: 0 on success, -errno on failure */ @@ -206,7 +207,10 @@ static const char lookup_table[65] = #define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) /** - * base64_encode() - + * base64_encode() - base64-encode some bytes + * @src: the bytes to encode + * @len: number of bytes to encode + * @dst: (output) the base64-encoded string. Not NUL-terminated. * * Encodes the input string using characters from the set [A-Za-z0-9+,]. * The encoded string is roughly 4/3 times the size of the input string. @@ -272,7 +276,12 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, } /** - * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames + * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @max_encrypted_len: maximum length of encrypted filenames the buffer will be + * used to present + * @crypto_str: (output) buffer to allocate * * Allocate a buffer that is large enough to hold any decrypted or encoded * filename (null-terminated), for the given maximum encrypted filename length. @@ -297,9 +306,10 @@ int fscrypt_fname_alloc_buffer(const struct inode *inode, EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * fscrypt_fname_free_buffer - free the buffer for presented filenames + * fscrypt_fname_free_buffer() - free a buffer for presented filenames + * @crypto_str: the buffer to free * - * Free the buffer allocated by fscrypt_fname_alloc_buffer(). + * Free a buffer that was allocated by fscrypt_fname_alloc_buffer(). */ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { @@ -311,10 +321,19 @@ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** - * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user - * space - * - * The caller must have allocated sufficient memory for the @oname string. + * fscrypt_fname_disk_to_usr() - convert an encrypted filename to + * user-presentable form + * @inode: inode of the parent directory (for regular filenames) + * or of the symlink (for symlink targets) + * @hash: first part of the name's dirhash, if applicable. This only needs to + * be provided if the filename is located in an indexed directory whose + * encryption key may be unavailable. Not needed for symlink targets. + * @minor_hash: second part of the name's dirhash, if applicable + * @iname: encrypted filename to convert. May also be "." or "..", which + * aren't actually encrypted. + * @oname: output buffer for the user-presentable filename. The caller must + * have allocated enough space for this, e.g. using + * fscrypt_fname_alloc_buffer(). * * If the key is available, we'll decrypt the disk name. Otherwise, we'll * encode it for presentation in fscrypt_nokey_name format. diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index dbced2937ec8..eb7fcd2b7fb8 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -43,7 +43,7 @@ struct fscrypt_context_v2 { u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; }; -/** +/* * fscrypt_context - the encryption context of an inode * * This is the on-disk equivalent of an fscrypt_policy, stored alongside each @@ -157,7 +157,7 @@ fscrypt_policy_flags(const union fscrypt_policy *policy) BUG(); } -/** +/* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. */ @@ -222,6 +222,9 @@ struct fscrypt_info { /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FS_KEY_DERIVATION_NONCE_SIZE]; + + /* Hashed inode number. Only set for IV_INO_LBLK_32 */ + u32 ci_hashed_ino; }; typedef enum { @@ -231,15 +234,14 @@ typedef enum { /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; -extern int fscrypt_initialize(unsigned int cop_flags); -extern int fscrypt_crypt_block(const struct inode *inode, - fscrypt_direction_t rw, u64 lblk_num, - struct page *src_page, struct page *dest_page, - unsigned int len, unsigned int offs, - gfp_t gfp_flags); -extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); - -extern void __printf(3, 4) __cold +int fscrypt_initialize(unsigned int cop_flags); +int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags); +struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); + +void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); #define fscrypt_warn(inode, fmt, ...) \ @@ -264,12 +266,10 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci); /* fname.c */ -extern int fscrypt_fname_encrypt(const struct inode *inode, - const struct qstr *iname, - u8 *out, unsigned int olen); -extern bool fscrypt_fname_encrypted_size(const struct inode *inode, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret); +int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret); extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ @@ -278,8 +278,8 @@ struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; -extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, - unsigned int master_key_size); +int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, + unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. These values are used as @@ -293,12 +293,14 @@ extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, #define HKDF_CONTEXT_DIRECT_KEY 3 #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 #define HKDF_CONTEXT_DIRHASH_KEY 5 +#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 +#define HKDF_CONTEXT_INODE_HASH_KEY 7 -extern int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen); +int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); -extern void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); +void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* keyring.c */ @@ -389,14 +391,17 @@ struct fscrypt_master_key { struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; - /* Crypto API transforms for DIRECT_KEY policies, allocated on-demand */ - struct crypto_skcipher *mk_direct_tfms[__FSCRYPT_MODE_MAX + 1]; - /* - * Crypto API transforms for filesystem-layer implementation of - * IV_INO_LBLK_64 policies, allocated on-demand. + * Per-mode encryption keys for the various types of encryption policies + * that use them. Allocated and derived on-demand. */ - struct crypto_skcipher *mk_iv_ino_lblk_64_tfms[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_direct_keys[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1]; + + /* Hash key for inode numbers. Initialized only when needed. */ + siphash_key_t mk_ino_hash_key; + bool mk_ino_hash_key_initialized; } __randomize_layout; @@ -436,14 +441,17 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) return 0; } -extern struct key * +struct key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); -extern int fscrypt_verify_key_added(struct super_block *sb, - const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); +int fscrypt_add_test_dummy_key(struct super_block *sb, + struct fscrypt_key_specifier *key_spec); + +int fscrypt_verify_key_added(struct super_block *sb, + const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); -extern int __init fscrypt_init_keyring(void); +int __init fscrypt_init_keyring(void); /* keysetup.c */ @@ -457,33 +465,32 @@ struct fscrypt_mode { extern struct fscrypt_mode fscrypt_modes[]; -extern struct crypto_skcipher * -fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, - const struct inode *inode); +struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, + const u8 *raw_key, + const struct inode *inode); -extern int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, - const u8 *raw_key); +int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); -extern int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, - const struct fscrypt_master_key *mk); +int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk); /* keysetup_v1.c */ -extern void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); +void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); + +int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, + const u8 *raw_master_key); -extern int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, - const u8 *raw_master_key); +int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci); -extern int fscrypt_setup_v1_file_key_via_subscribed_keyrings( - struct fscrypt_info *ci); /* policy.c */ -extern bool fscrypt_policies_equal(const union fscrypt_policy *policy1, - const union fscrypt_policy *policy2); -extern bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, - const struct inode *inode); -extern int fscrypt_policy_from_context(union fscrypt_policy *policy_u, - const union fscrypt_context *ctx_u, - int ctx_size); +bool fscrypt_policies_equal(const union fscrypt_policy *policy1, + const union fscrypt_policy *policy2); +bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, + const struct inode *inode); +int fscrypt_policy_from_context(union fscrypt_policy *policy_u, + const union fscrypt_context *ctx_u, + int ctx_size); #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index efb95bd19a89..0cba7928446d 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -44,17 +44,13 @@ static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) { static const u8 default_salt[HKDF_HASHLEN]; - SHASH_DESC_ON_STACK(desc, hmac_tfm); int err; err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); if (err) return err; - desc->tfm = hmac_tfm; - err = crypto_shash_digest(desc, ikm, ikmlen, prk); - shash_desc_zero(desc); - return err; + return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); } /* diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 5ef861742921..09fb8aa0f2e9 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -10,7 +10,7 @@ #include "fscrypt_private.h" /** - * fscrypt_file_open - prepare to open a possibly-encrypted regular file + * fscrypt_file_open() - prepare to open a possibly-encrypted regular file * @inode: the inode being opened * @filp: the struct file being set up * @@ -262,7 +262,7 @@ err_free_sd: EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); /** - * fscrypt_get_symlink - get the target of an encrypted symlink + * fscrypt_get_symlink() - get the target of an encrypted symlink * @inode: the symlink inode * @caddr: the on-disk contents of the symlink * @max_size: size of @caddr buffer diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index ab41b25d4fa1..e24eb48bfbe1 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -20,6 +20,7 @@ #include <crypto/skcipher.h> #include <linux/key-type.h> +#include <linux/random.h> #include <linux/seq_file.h> #include "fscrypt_private.h" @@ -44,8 +45,9 @@ static void free_master_key(struct fscrypt_master_key *mk) wipe_master_key_secret(&mk->mk_secret); for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) { - crypto_free_skcipher(mk->mk_direct_tfms[i]); - crypto_free_skcipher(mk->mk_iv_ino_lblk_64_tfms[i]); + crypto_free_skcipher(mk->mk_direct_keys[i]); + crypto_free_skcipher(mk->mk_iv_ino_lblk_64_keys[i]); + crypto_free_skcipher(mk->mk_iv_ino_lblk_32_keys[i]); } key_put(mk->mk_users); @@ -424,9 +426,9 @@ static int add_existing_master_key(struct fscrypt_master_key *mk, return 0; } -static int add_master_key(struct super_block *sb, - struct fscrypt_master_key_secret *secret, - const struct fscrypt_key_specifier *mk_spec) +static int do_add_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + const struct fscrypt_key_specifier *mk_spec) { static DEFINE_MUTEX(fscrypt_add_key_mutex); struct key *key; @@ -465,6 +467,35 @@ out_unlock: return err; } +static int add_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + struct fscrypt_key_specifier *key_spec) +{ + int err; + + if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { + err = fscrypt_init_hkdf(&secret->hkdf, secret->raw, + secret->size); + if (err) + return err; + + /* + * Now that the HKDF context is initialized, the raw key is no + * longer needed. + */ + memzero_explicit(secret->raw, secret->size); + + /* Calculate the key identifier */ + err = fscrypt_hkdf_expand(&secret->hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER, NULL, 0, + key_spec->u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + if (err) + return err; + } + return do_add_master_key(sb, secret, key_spec); +} + static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) { const struct fscrypt_provisioning_key_payload *payload = prep->data; @@ -609,6 +640,15 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; + /* + * Only root can add keys that are identified by an arbitrary descriptor + * rather than by a cryptographic hash --- since otherwise a malicious + * user could add the wrong key. + */ + if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + memset(&secret, 0, sizeof(secret)); if (arg.key_id) { if (arg.raw_size != 0) @@ -626,48 +666,17 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) goto out_wipe_secret; } - switch (arg.key_spec.type) { - case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: - /* - * Only root can add keys that are identified by an arbitrary - * descriptor rather than by a cryptographic hash --- since - * otherwise a malicious user could add the wrong key. - */ - err = -EACCES; - if (!capable(CAP_SYS_ADMIN)) - goto out_wipe_secret; - break; - case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: - err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); - if (err) - goto out_wipe_secret; - - /* - * Now that the HKDF context is initialized, the raw key is no - * longer needed. - */ - memzero_explicit(secret.raw, secret.size); - - /* Calculate the key identifier and return it to userspace. */ - err = fscrypt_hkdf_expand(&secret.hkdf, - HKDF_CONTEXT_KEY_IDENTIFIER, - NULL, 0, arg.key_spec.u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - if (err) - goto out_wipe_secret; - err = -EFAULT; - if (copy_to_user(uarg->key_spec.u.identifier, - arg.key_spec.u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE)) - goto out_wipe_secret; - break; - default: - WARN_ON(1); - err = -EINVAL; + err = add_master_key(sb, &secret, &arg.key_spec); + if (err) goto out_wipe_secret; - } - err = add_master_key(sb, &secret, &arg.key_spec); + /* Return the key identifier to userspace, if applicable */ + err = -EFAULT; + if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && + copy_to_user(uarg->key_spec.u.identifier, arg.key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE)) + goto out_wipe_secret; + err = 0; out_wipe_secret: wipe_master_key_secret(&secret); return err; @@ -675,6 +684,29 @@ out_wipe_secret: EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); /* + * Add the key for '-o test_dummy_encryption' to the filesystem keyring. + * + * Use a per-boot random key to prevent people from misusing this option. + */ +int fscrypt_add_test_dummy_key(struct super_block *sb, + struct fscrypt_key_specifier *key_spec) +{ + static u8 test_key[FSCRYPT_MAX_KEY_SIZE]; + struct fscrypt_master_key_secret secret; + int err; + + get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE); + + memset(&secret, 0, sizeof(secret)); + secret.size = FSCRYPT_MAX_KEY_SIZE; + memcpy(secret.raw, test_key, FSCRYPT_MAX_KEY_SIZE); + + err = add_master_key(sb, &secret, key_spec); + wipe_master_key_secret(&secret); + return err; +} + +/* * Verify that the current user has added a master key with the given identifier * (returns -ENOKEY if not). This is needed to prevent a user from encrypting * their files using some other user's key which they don't actually know. diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 302375e9f719..1129adfa097d 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -46,6 +46,8 @@ struct fscrypt_mode fscrypt_modes[] = { }, }; +static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); + static struct fscrypt_mode * select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) @@ -130,7 +132,7 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; - struct crypto_skcipher *tfm, *prev_tfm; + struct crypto_skcipher *tfm; u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; @@ -139,10 +141,17 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX)) return -EINVAL; - /* pairs with cmpxchg() below */ + /* pairs with smp_store_release() below */ tfm = READ_ONCE(tfms[mode_num]); - if (likely(tfm != NULL)) - goto done; + if (likely(tfm != NULL)) { + ci->ci_ctfm = tfm; + return 0; + } + + mutex_lock(&fscrypt_mode_key_setup_mutex); + + if (tfms[mode_num]) + goto done_unlock; BUILD_BUG_ON(sizeof(mode_num) != 1); BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); @@ -157,21 +166,21 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, hkdf_context, hkdf_info, hkdf_infolen, mode_key, mode->keysize); if (err) - return err; + goto out_unlock; tfm = fscrypt_allocate_skcipher(mode, mode_key, inode); memzero_explicit(mode_key, mode->keysize); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - /* pairs with READ_ONCE() above */ - prev_tfm = cmpxchg(&tfms[mode_num], NULL, tfm); - if (prev_tfm != NULL) { - crypto_free_skcipher(tfm); - tfm = prev_tfm; + if (IS_ERR(tfm)) { + err = PTR_ERR(tfm); + goto out_unlock; } -done: + /* pairs with READ_ONCE() above */ + smp_store_release(&tfms[mode_num], tfm); +done_unlock: ci->ci_ctfm = tfm; - return 0; + err = 0; +out_unlock: + mutex_unlock(&fscrypt_mode_key_setup_mutex); + return err; } int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, @@ -189,6 +198,43 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, return 0; } +static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, + struct fscrypt_master_key *mk) +{ + int err; + + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys, + HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true); + if (err) + return err; + + /* pairs with smp_store_release() below */ + if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) { + + mutex_lock(&fscrypt_mode_key_setup_mutex); + + if (mk->mk_ino_hash_key_initialized) + goto unlock; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, + (u8 *)&mk->mk_ino_hash_key, + sizeof(mk->mk_ino_hash_key)); + if (err) + goto unlock; + /* pairs with smp_load_acquire() above */ + smp_store_release(&mk->mk_ino_hash_key_initialized, true); +unlock: + mutex_unlock(&fscrypt_mode_key_setup_mutex); + if (err) + return err; + } + + ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, + &mk->mk_ino_hash_key); + return 0; +} + static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk) { @@ -203,7 +249,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * encryption key. This ensures that the master key is * consistently used only for HKDF, avoiding key reuse issues. */ - err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_tfms, + err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys, HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { @@ -211,11 +257,14 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * IV_INO_LBLK_64: encryption keys are derived from (master_key, * mode_num, filesystem_uuid), and inode number is included in * the IVs. This format is optimized for use with inline - * encryption hardware compliant with the UFS or eMMC standards. + * encryption hardware compliant with the UFS standard. */ - err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms, + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys, HKDF_CONTEXT_IV_INO_LBLK_64_KEY, true); + } else if (ci->ci_policy.v2.flags & + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { + err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; @@ -395,21 +444,18 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode) || - IS_ENCRYPTED(inode)) { + const union fscrypt_context *dummy_ctx = + fscrypt_get_dummy_context(inode->i_sb); + + if (IS_ENCRYPTED(inode) || !dummy_ctx) { fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } /* Fake up a context for an unencrypted directory */ - memset(&ctx, 0, sizeof(ctx)); - ctx.version = FSCRYPT_CONTEXT_V1; - ctx.v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; - ctx.v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memset(ctx.v1.master_key_descriptor, 0x42, - FSCRYPT_KEY_DESCRIPTOR_SIZE); - res = sizeof(ctx.v1); + res = fscrypt_context_size(dummy_ctx); + memcpy(&ctx, dummy_ctx, res); } crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); @@ -475,7 +521,8 @@ out: EXPORT_SYMBOL(fscrypt_get_encryption_info); /** - * fscrypt_put_encryption_info - free most of an inode's fscrypt data + * fscrypt_put_encryption_info() - free most of an inode's fscrypt data + * @inode: an inode being evicted * * Free the inode's fscrypt_info. Filesystems must call this when the inode is * being evicted. An RCU grace period need not have elapsed yet. @@ -488,7 +535,8 @@ void fscrypt_put_encryption_info(struct inode *inode) EXPORT_SYMBOL(fscrypt_put_encryption_info); /** - * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay + * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay + * @inode: an inode being freed * * Free the inode's cached decrypted symlink target, if any. Filesystems must * call this after an RCU grace period, just before they free the inode. @@ -503,7 +551,8 @@ void fscrypt_free_inode(struct inode *inode) EXPORT_SYMBOL(fscrypt_free_inode); /** - * fscrypt_drop_inode - check whether the inode's master key has been removed + * fscrypt_drop_inode() - check whether the inode's master key has been removed + * @inode: an inode being considered for eviction * * Filesystems supporting fscrypt must call this from their ->drop_inode() * method so that encrypted inodes are evicted as soon as they're no longer in diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 10ccf945020c..d23ff162c78b 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -11,12 +11,15 @@ */ #include <linux/random.h> +#include <linux/seq_file.h> #include <linux/string.h> #include <linux/mount.h> #include "fscrypt_private.h" /** - * fscrypt_policies_equal - check whether two encryption policies are the same + * fscrypt_policies_equal() - check whether two encryption policies are the same + * @policy1: the first policy + * @policy2: the second policy * * Return: %true if equal, else %false */ @@ -66,18 +69,14 @@ static bool supported_direct_key_modes(const struct inode *inode, return true; } -static bool supported_iv_ino_lblk_64_policy( - const struct fscrypt_policy_v2 *policy, - const struct inode *inode) +static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, + const struct inode *inode, + const char *type, + int max_ino_bits, int max_lblk_bits) { struct super_block *sb = inode->i_sb; int ino_bits = 64, lblk_bits = 64; - if (policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { - fscrypt_warn(inode, - "The DIRECT_KEY and IV_INO_LBLK_64 flags are mutually exclusive"); - return false; - } /* * It's unsafe to include inode numbers in the IVs if the filesystem can * potentially renumber inodes, e.g. via filesystem shrinking. @@ -85,16 +84,22 @@ static bool supported_iv_ino_lblk_64_policy( if (!sb->s_cop->has_stable_inodes || !sb->s_cop->has_stable_inodes(sb)) { fscrypt_warn(inode, - "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't have stable inode numbers", - sb->s_id); + "Can't use %s policy on filesystem '%s' because it doesn't have stable inode numbers", + type, sb->s_id); return false; } if (sb->s_cop->get_ino_and_lblk_bits) sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); - if (ino_bits > 32 || lblk_bits > 32) { + if (ino_bits > max_ino_bits) { + fscrypt_warn(inode, + "Can't use %s policy on filesystem '%s' because its inode numbers are too long", + type, sb->s_id); + return false; + } + if (lblk_bits > max_lblk_bits) { fscrypt_warn(inode, - "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't use 32-bit inode and block numbers", - sb->s_id); + "Can't use %s policy on filesystem '%s' because its block numbers are too long", + type, sb->s_id); return false; } return true; @@ -137,6 +142,8 @@ static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { + int count = 0; + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, @@ -152,13 +159,29 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, return false; } + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY); + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64); + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32); + if (count > 1) { + fscrypt_warn(inode, "Mutually exclusive encryption flags (0x%02x)", + policy->flags); + return false; + } + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) return false; if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && - !supported_iv_ino_lblk_64_policy(policy, inode)) + !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64", + 32, 32)) + return false; + + if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && + /* This uses hashed inode numbers, so ino_bits doesn't matter. */ + !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", + INT_MAX, 32)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { @@ -170,7 +193,9 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, } /** - * fscrypt_supported_policy - check whether an encryption policy is supported + * fscrypt_supported_policy() - check whether an encryption policy is supported + * @policy_u: the encryption policy + * @inode: the inode on which the policy will be used * * Given an encryption policy, check whether all its encryption modes and other * settings are supported by this kernel on the given inode. (But we don't @@ -192,7 +217,10 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, } /** - * fscrypt_new_context_from_policy - create a new fscrypt_context from a policy + * fscrypt_new_context_from_policy() - create a new fscrypt_context from + * an fscrypt_policy + * @ctx_u: output context + * @policy_u: input policy * * Create an fscrypt_context for an inode that is being assigned the given * encryption policy. A new nonce is randomly generated. @@ -242,7 +270,11 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, } /** - * fscrypt_policy_from_context - convert an fscrypt_context to an fscrypt_policy + * fscrypt_policy_from_context() - convert an fscrypt_context to + * an fscrypt_policy + * @policy_u: output policy + * @ctx_u: input context + * @ctx_size: size of input context in bytes * * Given an fscrypt_context, build the corresponding fscrypt_policy. * @@ -354,6 +386,9 @@ static int set_encryption_policy(struct inode *inode, policy->v2.master_key_identifier); if (err) return err; + if (policy->v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) + pr_warn_once("%s (pid %d) is setting an IV_INO_LBLK_32 encryption policy. This should only be used if there are certain hardware limitations.\n", + current->comm, current->pid); break; default: WARN_ON(1); @@ -605,3 +640,127 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return preload ? fscrypt_get_encryption_info(child): 0; } EXPORT_SYMBOL(fscrypt_inherit_context); + +/** + * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' + * @sb: the filesystem on which test_dummy_encryption is being specified + * @arg: the argument to the test_dummy_encryption option. + * If no argument was specified, then @arg->from == NULL. + * @dummy_ctx: the filesystem's current dummy context (input/output, see below) + * + * Handle the test_dummy_encryption mount option by creating a dummy encryption + * context, saving it in @dummy_ctx, and adding the corresponding dummy + * encryption key to the filesystem. If the @dummy_ctx is already set, then + * instead validate that it matches @arg. Don't support changing it via + * remount, as that is difficult to do safely. + * + * The reason we use an fscrypt_context rather than an fscrypt_policy is because + * we mustn't generate a new nonce each time we access a dummy-encrypted + * directory, as that would change the way filenames are encrypted. + * + * Return: 0 on success (dummy context set, or the same context is already set); + * -EEXIST if a different dummy context is already set; + * or another -errno value. + */ +int fscrypt_set_test_dummy_encryption(struct super_block *sb, + const substring_t *arg, + struct fscrypt_dummy_context *dummy_ctx) +{ + const char *argstr = "v2"; + const char *argstr_to_free = NULL; + struct fscrypt_key_specifier key_spec = { 0 }; + int version; + union fscrypt_context *ctx = NULL; + int err; + + if (arg->from) { + argstr = argstr_to_free = match_strdup(arg); + if (!argstr) + return -ENOMEM; + } + + if (!strcmp(argstr, "v1")) { + version = FSCRYPT_CONTEXT_V1; + key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memset(key_spec.u.descriptor, 0x42, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + } else if (!strcmp(argstr, "v2")) { + version = FSCRYPT_CONTEXT_V2; + key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + /* key_spec.u.identifier gets filled in when adding the key */ + } else { + err = -EINVAL; + goto out; + } + + if (dummy_ctx->ctx) { + /* + * Note: if we ever make test_dummy_encryption support + * specifying other encryption settings, such as the encryption + * modes, we'll need to compare those settings here. + */ + if (dummy_ctx->ctx->version == version) + err = 0; + else + err = -EEXIST; + goto out; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto out; + } + + err = fscrypt_add_test_dummy_key(sb, &key_spec); + if (err) + goto out; + + ctx->version = version; + switch (ctx->version) { + case FSCRYPT_CONTEXT_V1: + ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + break; + case FSCRYPT_CONTEXT_V2: + ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + break; + default: + WARN_ON(1); + err = -EINVAL; + goto out; + } + dummy_ctx->ctx = ctx; + ctx = NULL; + err = 0; +out: + kfree(ctx); + kfree(argstr_to_free); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); + +/** + * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption' + * @seq: the seq_file to print the option to + * @sep: the separator character to use + * @sb: the filesystem whose options are being shown + * + * Show the test_dummy_encryption mount option, if it was specified. + * This is mainly used for /proc/mounts. + */ +void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, + struct super_block *sb) +{ + const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb); + + if (!ctx) + return; + seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version); +} +EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption); diff --git a/fs/dcache.c b/fs/dcache.c index b280e07e162b..0d07fb335b78 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -647,6 +647,10 @@ static inline bool retain_dentry(struct dentry *dentry) if (dentry->d_op->d_delete(dentry)) return false; } + + if (unlikely(dentry->d_flags & DCACHE_DONTCACHE)) + return false; + /* retain; LRU fodder */ dentry->d_lockref.count--; if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) @@ -656,6 +660,21 @@ static inline bool retain_dentry(struct dentry *dentry) return true; } +void d_mark_dontcache(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { + spin_lock(&de->d_lock); + de->d_flags |= DCACHE_DONTCACHE; + spin_unlock(&de->d_lock); + } + inode->i_state |= I_DONTCACHE; + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL(d_mark_dontcache); + /* * Finish off a dentry we've decided to kill. * dentry->d_lock must be held, returns with it unlocked. diff --git a/fs/direct-io.c b/fs/direct-io.c index c44d60f375bc..1543b5af400e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -481,7 +481,7 @@ static struct bio *dio_await_one(struct dio *dio) spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) - io_schedule(); + blk_io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index 522c35d5292b..1bdeaa6d5790 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig @@ -7,7 +7,7 @@ config ECRYPT_FS select CRYPTO_MD5 help Encrypted filesystem that operates on the VFS layer. See - <file:Documentation/filesystems/ecryptfs.txt> to learn more about + <file:Documentation/filesystems/ecryptfs.rst> to learn more about eCryptfs. Userspace components are required and can be obtained from <http://ecryptfs.sf.net>. diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 2c449aed1b92..0681540c48d9 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -48,18 +48,6 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size) } } -static int ecryptfs_hash_digest(struct crypto_shash *tfm, - char *src, int len, char *dst) -{ - SHASH_DESC_ON_STACK(desc, tfm); - int err; - - desc->tfm = tfm; - err = crypto_shash_digest(desc, src, len, dst); - shash_desc_zero(desc); - return err; -} - /** * ecryptfs_calculate_md5 - calculates the md5 of @src * @dst: Pointer to 16 bytes of allocated memory @@ -74,11 +62,8 @@ static int ecryptfs_calculate_md5(char *dst, struct ecryptfs_crypt_stat *crypt_stat, char *src, int len) { - struct crypto_shash *tfm; - int rc = 0; + int rc = crypto_shash_tfm_digest(crypt_stat->hash_tfm, src, len, dst); - tfm = crypt_stat->hash_tfm; - rc = ecryptfs_hash_digest(tfm, src, len, dst); if (rc) { printk(KERN_ERR "%s: Error computing crypto hash; rc = [%d]\n", diff --git a/fs/erofs/data.c b/fs/erofs/data.c index fc3a8d8064f8..d0542151e8c4 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -280,47 +280,36 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page) return 0; } -static int erofs_raw_access_readpages(struct file *filp, - struct address_space *mapping, - struct list_head *pages, - unsigned int nr_pages) +static void erofs_raw_access_readahead(struct readahead_control *rac) { erofs_off_t last_block; struct bio *bio = NULL; - gfp_t gfp = readahead_gfp_mask(mapping); - struct page *page = list_last_entry(pages, struct page, lru); - - trace_erofs_readpages(mapping->host, page, nr_pages, true); + struct page *page; - for (; nr_pages; --nr_pages) { - page = list_entry(pages->prev, struct page, lru); + trace_erofs_readpages(rac->mapping->host, readahead_index(rac), + readahead_count(rac), true); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { - bio = erofs_read_raw_page(bio, mapping, page, - &last_block, nr_pages, true); + bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block, + readahead_count(rac), true); - /* all the page errors are ignored when readahead */ - if (IS_ERR(bio)) { - pr_err("%s, readahead error at page %lu of nid %llu\n", - __func__, page->index, - EROFS_I(mapping->host)->nid); + /* all the page errors are ignored when readahead */ + if (IS_ERR(bio)) { + pr_err("%s, readahead error at page %lu of nid %llu\n", + __func__, page->index, + EROFS_I(rac->mapping->host)->nid); - bio = NULL; - } + bio = NULL; } - /* pages could still be locked */ put_page(page); } - DBG_BUGON(!list_empty(pages)); /* the rare case (end in gaps) */ if (bio) submit_bio(bio); - return 0; } static int erofs_get_block(struct inode *inode, sector_t iblock, @@ -358,7 +347,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) /* for uncompressed (aligned) files and raw access for other files */ const struct address_space_operations erofs_raw_access_aops = { .readpage = erofs_raw_access_readpage, - .readpages = erofs_raw_access_readpages, + .readahead = erofs_raw_access_readahead, .bmap = erofs_bmap, }; diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 5d2d81940679..7628816f2453 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -274,7 +274,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, i = 0; while (1) { - dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL); + dst = vm_map_ram(rq->out, nrpages_out, -1); /* retry two more times (totally 3 times) */ if (dst || ++i >= 3) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c4b6c9aa87ec..187f93b4900e 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1305,28 +1305,23 @@ static bool should_decompress_synchronously(struct erofs_sb_info *sbi, return nr <= sbi->max_sync_decompress_pages; } -static int z_erofs_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void z_erofs_readahead(struct readahead_control *rac) { - struct inode *const inode = mapping->host; + struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - bool sync = should_decompress_synchronously(sbi, nr_pages); + bool sync = should_decompress_synchronously(sbi, readahead_count(rac)); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); - struct page *head = NULL; + struct page *page, *head = NULL; LIST_HEAD(pagepool); - trace_erofs_readpages(mapping->host, lru_to_page(pages), - nr_pages, false); + trace_erofs_readpages(inode, readahead_index(rac), + readahead_count(rac), false); - f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT; - - for (; nr_pages; --nr_pages) { - struct page *page = lru_to_page(pages); + f.headoffset = readahead_pos(rac); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); /* * A pure asynchronous readahead is indicated if @@ -1335,11 +1330,6 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, */ sync &= !(PageReadahead(page) && !head); - if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { - list_add(&page->lru, &pagepool); - continue; - } - set_page_private(page, (unsigned long)head); head = page; } @@ -1368,11 +1358,10 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, /* clean up the remaining free pages */ put_pages_list(&pagepool); - return 0; } const struct address_space_operations z_erofs_aops = { .readpage = z_erofs_readpage, - .readpages = z_erofs_readpages, + .readahead = z_erofs_readahead, }; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 06887492f54b..785ead346543 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -372,10 +372,9 @@ static int exfat_readpage(struct file *file, struct page *page) return mpage_readpage(page, exfat_get_block); } -static int exfat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void exfat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, exfat_get_block); + mpage_readahead(rac, exfat_get_block); } static int exfat_writepage(struct page *page, struct writeback_control *wbc) @@ -502,7 +501,7 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations exfat_aops = { .readpage = exfat_readpage, - .readpages = exfat_readpages, + .readahead = exfat_readahead, .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c885cf7d724b..2875c0a705b5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -877,11 +877,9 @@ static int ext2_readpage(struct file *file, struct page *page) return mpage_readpage(page, ext2_get_block); } -static int -ext2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ext2_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, ext2_get_block); + mpage_readahead(rac, ext2_get_block); } static int @@ -967,7 +965,7 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc const struct address_space_operations ext2_aops = { .readpage = ext2_readpage, - .readpages = ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_writepage, .write_begin = ext2_write_begin, .write_end = ext2_write_end, @@ -981,7 +979,7 @@ const struct address_space_operations ext2_aops = { const struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, - .readpages = ext2_readpages, + .readahead = ext2_readahead, .writepage = ext2_nobh_writepage, .write_begin = ext2_nobh_write_begin, .write_end = nobh_write_end, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ad2dbf6e4924..15b062efcff1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1357,11 +1357,9 @@ struct ext4_super_block { */ #define EXT4_MF_MNTDIR_SAMPLED 0x0001 #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ -#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 #ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ - EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif @@ -1551,6 +1549,9 @@ struct ext4_sb_info { struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + /* Encryption context for '-o test_dummy_encryption' */ + struct fscrypt_dummy_context s_dummy_enc_ctx; + /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA * or EXTENTS flag. @@ -3316,9 +3317,8 @@ static inline void ext4_set_de_type(struct super_block *sb, } /* readpages.c */ -extern int ext4_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); +extern int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index e10206e7f4bb..35ff9a56db67 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -176,7 +176,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = ext4_fsync_journal(inode, datasync, &needs_barrier); if (needs_barrier) { - err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (!ret) ret = err; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4b8c9a9bdf0c..499f08d8522e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1440,7 +1440,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, if (ret < 0) goto err_out; if (barrier) - blkdev_issue_flush(sb->s_bdev, GFP_NOFS, NULL); + blkdev_issue_flush(sb->s_bdev, GFP_NOFS); skip_zeroout: ext4_lock_group(sb, group); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4aae6acdcb..52be85f96159 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3224,23 +3224,20 @@ static int ext4_readpage(struct file *file, struct page *page) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) - return ext4_mpage_readpages(page->mapping, NULL, page, 1, - false); + return ext4_mpage_readpages(inode, NULL, page); return ret; } -static int -ext4_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ext4_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; + struct inode *inode = rac->mapping->host; - /* If the file has inline data, no need to do readpages. */ + /* If the file has inline data, no need to do readahead. */ if (ext4_has_inline_data(inode)) - return 0; + return; - return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true); + ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidatepage(struct page *page, unsigned int offset, @@ -3605,7 +3602,7 @@ static int ext4_set_page_dirty(struct page *page) static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, @@ -3622,7 +3619,7 @@ static const struct address_space_operations ext4_aops = { static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_write_begin, @@ -3638,7 +3635,7 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, - .readpages = ext4_readpages, + .readahead = ext4_readahead, .writepage = ext4_writepage, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c1769afbf799..5761e9961682 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -7,8 +7,8 @@ * * This was originally taken from fs/mpage.c * - * The intent is the ext4_mpage_readpages() function here is intended - * to replace mpage_readpages() in the general case, not just for + * The ext4_mpage_readpages() function here is intended to + * replace mpage_readahead() in the general case, not just for * encrypted files. It has some limitations (see below), where it * will fall back to read_block_full_page(), but these limitations * should only be hit when page_size != block_size. @@ -221,14 +221,12 @@ static inline loff_t ext4_readpage_limit(struct inode *inode) return i_size_read(inode); } -int ext4_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead) +int ext4_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; @@ -241,6 +239,7 @@ int ext4_mpage_readpages(struct address_space *mapping, int length; unsigned relative_block = 0; struct ext4_map_blocks map; + unsigned int nr_pages = rac ? readahead_count(rac) : 1; map.m_pblk = 0; map.m_lblk = 0; @@ -251,14 +250,9 @@ int ext4_mpage_readpages(struct address_space *mapping, int fully_mapped = 1; unsigned first_hole = blocks_per_page; - if (pages) { - page = lru_to_page(pages); - + if (rac) { + page = readahead_page(rac); prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, page->index, - readahead_gfp_mask(mapping))) - goto next_page; } if (page_has_buffers(page)) @@ -381,7 +375,7 @@ int ext4_mpage_readpages(struct address_space *mapping, bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio_set_op_attrs(bio, REQ_OP_READ, - is_readahead ? REQ_RAHEAD : 0); + rac ? REQ_RAHEAD : 0); } length = first_hole << blkbits; @@ -406,10 +400,9 @@ int ext4_mpage_readpages(struct address_space *mapping, else unlock_page(page); next_page: - if (pages) + if (rac) put_page(page); } - BUG_ON(pages && !list_empty(pages)); if (bio) submit_bio(bio); return 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..9824cd8203e8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1106,6 +1106,7 @@ static void ext4_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); + fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); #ifdef CONFIG_UNICODE utf8_unload(sbi->s_encoding); #endif @@ -1389,9 +1390,10 @@ retry: return res; } -static bool ext4_dummy_context(struct inode *inode) +static const union fscrypt_context * +ext4_get_dummy_context(struct super_block *sb) { - return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); + return EXT4_SB(sb)->s_dummy_enc_ctx.ctx; } static bool ext4_has_stable_inodes(struct super_block *sb) @@ -1410,7 +1412,7 @@ static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, - .dummy_context = ext4_dummy_context, + .get_dummy_context = ext4_get_dummy_context, .empty_dir = ext4_empty_dir, .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, @@ -1605,6 +1607,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ @@ -1816,7 +1819,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, - {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_test_dummy_encryption, 0, MOPT_STRING}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_err, 0, 0} }; @@ -1851,6 +1854,48 @@ static int ext4_sb_read_encoding(const struct ext4_super_block *es, } #endif +static int ext4_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ +#ifdef CONFIG_FS_ENCRYPTION + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (is_remount && !sbi->s_dummy_enc_ctx.ctx) { + ext4_msg(sb, KERN_WARNING, + "Can't set test_dummy_encryption on remount"); + return -1; + } + err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx); + if (err) { + if (err == -EEXIST) + ext4_msg(sb, KERN_WARNING, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + ext4_msg(sb, KERN_WARNING, + "Value of option \"%s\" is unrecognized", opt); + else + ext4_msg(sb, KERN_WARNING, + "Error processing option \"%s\" [%d]", + opt, err); + return -1; + } + ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); +#else + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mount option ignored"); +#endif + return 1; +} + static int handle_mount_opt(struct super_block *sb, char *opt, int token, substring_t *args, unsigned long *journal_devnum, unsigned int *journal_ioprio, int is_remount) @@ -2047,14 +2092,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); } else if (token == Opt_test_dummy_encryption) { -#ifdef CONFIG_FS_ENCRYPTION - sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION; - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mode enabled"); -#else - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mount option ignored"); -#endif + return ext4_set_test_dummy_encryption(sb, opt, &args[0], + is_remount); } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) @@ -2311,8 +2350,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); - if (DUMMY_ENCRYPTION_ENABLED(sbi)) - SEQ_OPTS_PUTS("test_dummy_encryption"); + + fscrypt_show_test_dummy_encryption(seq, sep, sb); ext4_show_quota_options(seq, sb); return 0; @@ -4780,6 +4819,7 @@ failed_mount: for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif + fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); ext4_blkdev_remove(sbi); brelse(bh); out_fail: @@ -5256,7 +5296,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) needs_barrier = true; if (needs_barrier) { int err; - err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); if (!ret) ret = err; } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 04bfaf63752c..6c9fc9e21c13 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -293,6 +293,7 @@ EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); #ifdef CONFIG_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); +EXT4_ATTR_FEATURE(test_dummy_encryption_v2); #endif #ifdef CONFIG_UNICODE EXT4_ATTR_FEATURE(casefold); @@ -308,6 +309,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(meta_bg_resize), #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), #endif #ifdef CONFIG_UNICODE ATTR_LIST(casefold), diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index dc5ec724d889..dec1244dd062 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -342,37 +342,6 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf, return desc_size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. - */ -static void ext4_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - ext4_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -386,8 +355,8 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - ext4_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index cdf2f626bea7..03ec97f28235 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2177,13 +2177,11 @@ out: * use ->readpage() or do the necessary surgery to decouple ->readpages() * from read-ahead. */ -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead) +static int f2fs_mpage_readpages(struct inode *inode, + struct readahead_control *rac, struct page *page) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; - struct inode *inode = mapping->host; struct f2fs_map_blocks map; #ifdef CONFIG_F2FS_FS_COMPRESSION struct compress_ctx cc = { @@ -2197,6 +2195,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, .nr_cpages = 0, }; #endif + unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; int ret = 0; @@ -2210,15 +2209,9 @@ int f2fs_mpage_readpages(struct address_space *mapping, map.m_may_create = false; for (; nr_pages; nr_pages--) { - if (pages) { - page = list_last_entry(pages, struct page, lru); - + if (rac) { + page = readahead_page(rac); prefetchw(&page->flags); - list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page_index(page), - readahead_gfp_mask(mapping))) - goto next_page; } #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2228,7 +2221,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead, false); + rac != NULL, false); f2fs_destroy_compress_ctx(&cc); if (ret) goto set_error_page; @@ -2251,7 +2244,7 @@ read_single_page: #endif ret = f2fs_read_single_page(inode, page, max_nr_pages, &map, - &bio, &last_block_in_bio, is_readahead); + &bio, &last_block_in_bio, rac); if (ret) { #ifdef CONFIG_F2FS_FS_COMPRESSION set_error_page: @@ -2260,8 +2253,10 @@ set_error_page: zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); } +#ifdef CONFIG_F2FS_FS_COMPRESSION next_page: - if (pages) +#endif + if (rac) put_page(page); #ifdef CONFIG_F2FS_FS_COMPRESSION @@ -2271,16 +2266,15 @@ next_page: ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead, false); + rac != NULL, false); f2fs_destroy_compress_ctx(&cc); } } #endif } - BUG_ON(pages && !list_empty(pages)); if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); - return pages ? 0 : ret; + return ret; } static int f2fs_read_data_page(struct file *file, struct page *page) @@ -2299,28 +2293,24 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = f2fs_mpage_readpages(page_file_mapping(page), - NULL, page, 1, false); + ret = f2fs_mpage_readpages(inode, NULL, page); return ret; } -static int f2fs_read_data_pages(struct file *file, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void f2fs_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; - struct page *page = list_last_entry(pages, struct page, lru); + struct inode *inode = rac->mapping->host; - trace_f2fs_readpages(inode, page, nr_pages); + trace_f2fs_readpages(inode, readahead_index(rac), readahead_count(rac)); if (!f2fs_is_compress_backend_ready(inode)) - return 0; + return; /* If the file has inline data, skip readpages */ if (f2fs_has_inline_data(inode)) - return 0; + return; - return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); + f2fs_mpage_readpages(inode, rac, NULL); } int f2fs_encrypt_one_page(struct f2fs_io_info *fio) @@ -3805,7 +3795,7 @@ static void f2fs_swap_deactivate(struct file *file) const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, - .readpages = f2fs_read_data_pages, + .readahead = f2fs_readahead, .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ba470d5687fe..5c0149d2f46a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,7 +138,7 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ - bool test_dummy_encryption; /* test dummy encryption */ + struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint */ @@ -1259,7 +1259,7 @@ enum fsync_mode { #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) \ - (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) + (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL)) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif @@ -3051,19 +3051,12 @@ static inline void f2fs_set_page_private(struct page *page, if (PagePrivate(page)) return; - get_page(page); - SetPagePrivate(page); - set_page_private(page, data); + attach_page_private(page, (void *)data); } static inline void f2fs_clear_page_private(struct page *page) { - if (!PagePrivate(page)) - return; - - set_page_private(page, 0); - ClearPagePrivate(page); - f2fs_put_page(page, 0); + detach_page_private(page); } /* @@ -3373,9 +3366,6 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_mpage_readpages(struct address_space *mapping, - struct list_head *pages, struct page *page, - unsigned nr_pages, bool is_readahead); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index); diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 5bc4dcd8fc03..8c4ea5003ef8 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -12,7 +12,6 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> -#include <linux/cryptohash.h> #include <linux/pagemap.h> #include <linux/unicode.h> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f2dfc21c6abb..8a9955902d84 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -202,6 +202,7 @@ static match_table_t f2fs_tokens = { {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_checkpoint_disable, "checkpoint=disable"}, {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, @@ -394,7 +395,52 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } #endif -static int parse_options(struct super_block *sb, char *options) +static int f2fs_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); +#ifdef CONFIG_FS_ENCRYPTION + int err; + + if (!f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, "Encrypt feature is off"); + return -EINVAL; + } + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) { + f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + err = fscrypt_set_test_dummy_encryption( + sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx); + if (err) { + if (err == -EEXIST) + f2fs_warn(sbi, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + f2fs_warn(sbi, "Value of option \"%s\" is unrecognized", + opt); + else + f2fs_warn(sbi, "Error processing option \"%s\" [%d]", + opt, err); + return -EINVAL; + } + f2fs_warn(sbi, "Test dummy encryption mode enabled"); +#else + f2fs_warn(sbi, "Test dummy encryption mount option ignored"); +#endif + return 0; +} + +static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; @@ -403,9 +449,7 @@ static int parse_options(struct super_block *sb, char *options) int arg = 0, ext_cnt; kuid_t uid; kgid_t gid; -#ifdef CONFIG_QUOTA int ret; -#endif if (!options) return 0; @@ -778,17 +822,10 @@ static int parse_options(struct super_block *sb, char *options) kvfree(name); break; case Opt_test_dummy_encryption: -#ifdef CONFIG_FS_ENCRYPTION - if (!f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, "Encrypt feature is off"); - return -EINVAL; - } - - F2FS_OPTION(sbi).test_dummy_encryption = true; - f2fs_info(sbi, "Test dummy encryption mode enabled"); -#else - f2fs_info(sbi, "Test dummy encryption mount option ignored"); -#endif + ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], + is_remount); + if (ret) + return ret; break; case Opt_checkpoint_disable_cap_perc: if (args->from && match_int(args, &arg)) @@ -1213,6 +1250,7 @@ static void f2fs_put_super(struct super_block *sb) for (i = 0; i < MAXQUOTAS; i++) kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); @@ -1543,10 +1581,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",whint_mode=%s", "user-based"); else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); -#ifdef CONFIG_FS_ENCRYPTION - if (F2FS_OPTION(sbi).test_dummy_encryption) - seq_puts(seq, ",test_dummy_encryption"); -#endif + + fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) seq_printf(seq, ",alloc_mode=%s", "default"); @@ -1575,7 +1611,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; - F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; @@ -1734,7 +1769,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi); /* parse mount options */ - err = parse_options(sb, data); + err = parse_options(sb, data, true); if (err) goto restore_opts; checkpoint_changed = @@ -2410,9 +2445,10 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, ctx, len, fs_data, XATTR_CREATE); } -static bool f2fs_dummy_context(struct inode *inode) +static const union fscrypt_context * +f2fs_get_dummy_context(struct super_block *sb) { - return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); + return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx; } static bool f2fs_has_stable_inodes(struct super_block *sb) @@ -2431,7 +2467,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .dummy_context = f2fs_dummy_context, + .get_dummy_context = f2fs_get_dummy_context, .empty_dir = f2fs_empty_dir, .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, @@ -3366,7 +3402,7 @@ try_onemore: goto free_sb_buf; } - err = parse_options(sb, options); + err = parse_options(sb, options, false); if (err) goto free_options; @@ -3769,6 +3805,7 @@ free_options: for (i = 0; i < MAXQUOTAS; i++) kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); kvfree(options); free_sb_buf: kvfree(raw_super); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e3bbbef9b4f0..3162f46b3c9b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -446,6 +446,7 @@ enum feat_id { FEAT_SB_CHECKSUM, FEAT_CASEFOLD, FEAT_COMPRESSION, + FEAT_TEST_DUMMY_ENCRYPTION_V2, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -466,6 +467,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_SB_CHECKSUM: case FEAT_CASEFOLD: case FEAT_COMPRESSION: + case FEAT_TEST_DUMMY_ENCRYPTION_V2: return sprintf(buf, "supported\n"); } return 0; @@ -563,6 +565,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2); #endif #ifdef CONFIG_BLK_DEV_ZONED F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); @@ -647,6 +650,7 @@ ATTRIBUTE_GROUPS(f2fs); static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), #endif #ifdef CONFIG_BLK_DEV_ZONED ATTR_LIST(block_zoned), diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index d7d430a6f130..865c9fb774fb 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -222,37 +222,6 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, return size; } -/* - * Prefetch some pages from the file's Merkle tree. - * - * This is basically a stripped-down version of __do_page_cache_readahead() - * which works on pages past i_size. - */ -static void f2fs_merkle_tree_readahead(struct address_space *mapping, - pgoff_t start_index, unsigned long count) -{ - LIST_HEAD(pages); - unsigned int nr_pages = 0; - struct page *page; - pgoff_t index; - struct blk_plug plug; - - for (index = start_index; index < start_index + count; index++) { - page = xa_load(&mapping->i_pages, index); - if (!page || xa_is_value(page)) { - page = __page_cache_alloc(readahead_gfp_mask(mapping)); - if (!page) - break; - page->index = index; - list_add(&page->lru, &pages); - nr_pages++; - } - } - blk_start_plug(&plug); - f2fs_mpage_readpages(mapping, &pages, NULL, nr_pages, true); - blk_finish_plug(&plug); -} - static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) @@ -266,8 +235,8 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - f2fs_merkle_tree_readahead(inode->i_mapping, index, - num_ra_pages); + page_cache_readahead_unbounded(inode->i_mapping, NULL, + index, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index 718163d0c621..ca31993dcb47 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -69,7 +69,7 @@ config VFAT_FS The VFAT support enlarges your kernel by about 10 KB and it only works if you said Y to the "DOS FAT fs support" above. Please read - the file <file:Documentation/filesystems/vfat.txt> for details. If + the file <file:Documentation/filesystems/vfat.rst> for details. If unsure, say Y. To compile this as a module, choose M here: the module will be called @@ -82,7 +82,7 @@ config FAT_DEFAULT_CODEPAGE help This option should be set to the codepage of your FAT filesystems. It can be overridden with the "codepage" mount option. - See <file:Documentation/filesystems/vfat.txt> for more information. + See <file:Documentation/filesystems/vfat.rst> for more information. config FAT_DEFAULT_IOCHARSET string "Default iocharset for FAT" @@ -96,7 +96,7 @@ config FAT_DEFAULT_IOCHARSET Note that "utf8" is not recommended for FAT filesystems. If unsure, you shouldn't set "utf8" here - select the next option instead if you would like to use UTF-8 encoded file names by default. - See <file:Documentation/filesystems/vfat.txt> for more information. + See <file:Documentation/filesystems/vfat.rst> for more information. Enable any character sets you need in File Systems/Native Language Support. @@ -114,4 +114,4 @@ config FAT_DEFAULT_UTF8 Say Y if you use UTF-8 encoding for file names, N otherwise. - See <file:Documentation/filesystems/vfat.txt> for more information. + See <file:Documentation/filesystems/vfat.rst> for more information. diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 054acd9fd033..b4ddf48fa444 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -804,8 +804,6 @@ static long fat_dir_ioctl(struct file *filp, unsigned int cmd, return fat_generic_ioctl(filp, cmd, arg); } - if (!access_ok(d1, sizeof(struct __fat_dirent[2]))) - return -EFAULT; /* * Yes, we don't need this put_user() absolutely. However old * code didn't return the right value. So, app use this value, @@ -844,8 +842,6 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, return fat_generic_ioctl(filp, cmd, (unsigned long)arg); } - if (!access_ok(d1, sizeof(struct compat_dirent[2]))) - return -EFAULT; /* * Yes, we don't need this put_user() absolutely. However old * code didn't return the right value. So, app use this value, diff --git a/fs/fat/file.c b/fs/fat/file.c index bdc4503c00a3..42134c58c87e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -195,7 +195,7 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 71946da84388..e6e68b2274a5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -210,10 +210,9 @@ static int fat_readpage(struct file *file, struct page *page) return mpage_readpage(page, fat_get_block); } -static int fat_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void fat_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, fat_get_block); + mpage_readahead(rac, fat_get_block); } static void fat_write_failed(struct address_space *mapping, loff_t to) @@ -344,7 +343,7 @@ int fat_block_truncate_page(struct inode *inode, loff_t from) static const struct address_space_operations fat_aops = { .readpage = fat_readpage, - .readpages = fat_readpages, + .readahead = fat_readahead, .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, diff --git a/fs/file_table.c b/fs/file_table.c index 30d55c9a1744..676e620948d2 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -198,6 +198,7 @@ static struct file *alloc_file(const struct path *path, int flags, file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); + file->f_sb_err = file_sample_sb_err(file); if ((file->f_mode & FMODE_READ) && likely(fop->read || fop->read_iter)) file->f_mode |= FMODE_CAN_READ; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76ac9c7d32ec..a750381d554a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1070,7 +1070,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, static unsigned long get_nr_dirty_pages(void) { return global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS) + get_nr_dirty_inodes(); } @@ -2320,7 +2319,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) WARN(bdi_cap_writeback_dirty(wb->bdi) && !test_bit(WB_registered, &wb->state), - "bdi-%s not registered\n", wb->bdi->name); + "bdi-%s not registered\n", bdi_dev_name(wb->bdi)); inode->dirtied_when = jiffies; if (dirtytime) diff --git a/fs/fs_context.c b/fs/fs_context.c index fc9f6ef93b55..7d5c5dd2b1d5 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -42,7 +42,6 @@ static const struct constant_table common_set_sb_flag[] = { { "dirsync", SB_DIRSYNC }, { "lazytime", SB_LAZYTIME }, { "mand", SB_MANDLOCK }, - { "posixacl", SB_POSIXACL }, { "ro", SB_RDONLY }, { "sync", SB_SYNCHRONOUS }, { }, @@ -53,44 +52,15 @@ static const struct constant_table common_clear_sb_flag[] = { { "nolazytime", SB_LAZYTIME }, { "nomand", SB_MANDLOCK }, { "rw", SB_RDONLY }, - { "silent", SB_SILENT }, { }, }; -static const char *const forbidden_sb_flag[] = { - "bind", - "dev", - "exec", - "move", - "noatime", - "nodev", - "nodiratime", - "noexec", - "norelatime", - "nostrictatime", - "nosuid", - "private", - "rec", - "relatime", - "remount", - "shared", - "slave", - "strictatime", - "suid", - "unbindable", -}; - /* * Check for a common mount option that manipulates s_flags. */ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) { unsigned int token; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++) - if (strcmp(key, forbidden_sb_flag[i]) == 0) - return -EINVAL; token = lookup_constant(common_set_sb_flag, key, 0); if (token) { diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 506c5e643f0d..5e796e6c38e5 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -8,7 +8,7 @@ config FSCACHE Different sorts of caches can be plugged in, depending on the resources available. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_STATS bool "Gather statistical information on local caching" @@ -25,7 +25,7 @@ config FSCACHE_STATS between CPUs. On the other hand, the stats are very useful for debugging purposes. Saying 'Y' here is recommended. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_HISTOGRAM bool "Gather latency information on local caching" @@ -42,7 +42,7 @@ config FSCACHE_HISTOGRAM bouncing between CPUs. On the other hand, the histogram may be useful for debugging purposes. Saying 'N' here is recommended. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_DEBUG bool "Debug FS-Cache" @@ -52,7 +52,7 @@ config FSCACHE_DEBUG management module. If this is set, the debugging output may be enabled by setting bits in /sys/modules/fscache/parameter/debug. - See Documentation/filesystems/caching/fscache.txt for more information. + See Documentation/filesystems/caching/fscache.rst for more information. config FSCACHE_OBJECT_LIST bool "Maintain global object list for debugging purposes" diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index f78793f3d21e..fcc136361415 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -172,7 +172,7 @@ no_preference: * * Initialise a record of a cache and fill in the name. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_init_cache(struct fscache_cache *cache, @@ -207,7 +207,7 @@ EXPORT_SYMBOL(fscache_init_cache); * * Add a cache to the system, making it available for netfs's to use. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ int fscache_add_cache(struct fscache_cache *cache, @@ -307,7 +307,7 @@ EXPORT_SYMBOL(fscache_add_cache); * Note that an I/O error occurred in a cache and that it should no longer be * used for anything. This also reports the error into the kernel log. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_io_error(struct fscache_cache *cache) @@ -355,7 +355,7 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache, * Withdraw a cache from service, unbinding all its cache objects from the * netfs cookies they're currently representing. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_withdraw_cache(struct fscache_cache *cache) diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 0ce39658a620..751bc5b1cddf 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -4,7 +4,7 @@ * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/netfs-api.txt for more information on + * See Documentation/filesystems/caching/netfs-api.rst for more information on * the netfs API. */ diff --git a/fs/fscache/object.c b/fs/fscache/object.c index cfeba839a0f2..cb2146e02cd5 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -4,7 +4,7 @@ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/object.txt for a description of the + * See Documentation/filesystems/caching/object.rst for a description of the * object state machine and the in-kernel representations. */ @@ -295,7 +295,7 @@ static void fscache_object_work_func(struct work_struct *work) * * Initialise a cache object description to its basic values. * - * See Documentation/filesystems/caching/backend-api.txt for a complete + * See Documentation/filesystems/caching/backend-api.rst for a complete * description. */ void fscache_object_init(struct fscache_object *object, diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 1a22a55f75a0..4a5651d4904e 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -4,7 +4,7 @@ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * - * See Documentation/filesystems/caching/operations.txt + * See Documentation/filesystems/caching/operations.rst */ #define FSCACHE_DEBUG_LEVEL OPERATION diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index eb2a585572dc..774b2618018a 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -12,7 +12,7 @@ config FUSE_FS although chances are your distribution already has that library installed if you've installed the "fuse" package itself. - See <file:Documentation/filesystems/fuse.txt> for more information. + See <file:Documentation/filesystems/fuse.rst> for more information. See <file:Documentation/Changes> for needed library/utility version. If you want to develop a userspace FS, or if you want to use diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 97eec7522bf2..c7a65cf2bcca 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2081,7 +2081,7 @@ static void end_polls(struct fuse_conn *fc) * The same effect is usually achievable through killing the filesystem daemon * and all users of the filesystem. The exception is the combination of an * asynchronous request and the tricky deadlock (see - * Documentation/filesystems/fuse.txt). + * Documentation/filesystems/fuse.rst). * * Aborting requests under I/O goes as follows: 1: Separate out unlocked * requests, they should be finished off immediately. Locked requests will be diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9d67b830fb7a..bac51c32d660 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -915,84 +915,40 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) fuse_readpages_end(fc, &ap->args, err); } -struct fuse_fill_data { - struct fuse_io_args *ia; - struct file *file; - struct inode *inode; - unsigned int nr_pages; - unsigned int max_pages; -}; - -static int fuse_readpages_fill(void *_data, struct page *page) +static void fuse_readahead(struct readahead_control *rac) { - struct fuse_fill_data *data = _data; - struct fuse_io_args *ia = data->ia; - struct fuse_args_pages *ap = &ia->ap; - struct inode *inode = data->inode; + struct inode *inode = rac->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); + unsigned int i, max_pages, nr_pages = 0; - fuse_wait_on_page_writeback(inode, page->index); - - if (ap->num_pages && - (ap->num_pages == fc->max_pages || - (ap->num_pages + 1) * PAGE_SIZE > fc->max_read || - ap->pages[ap->num_pages - 1]->index + 1 != page->index)) { - data->max_pages = min_t(unsigned int, data->nr_pages, - fc->max_pages); - fuse_send_readpages(ia, data->file); - data->ia = ia = fuse_io_alloc(NULL, data->max_pages); - if (!ia) { - unlock_page(page); - return -ENOMEM; - } - ap = &ia->ap; - } - - if (WARN_ON(ap->num_pages >= data->max_pages)) { - unlock_page(page); - fuse_io_free(ia); - return -EIO; - } - - get_page(page); - ap->pages[ap->num_pages] = page; - ap->descs[ap->num_pages].length = PAGE_SIZE; - ap->num_pages++; - data->nr_pages--; - return 0; -} - -static int fuse_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_fill_data data; - int err; - - err = -EIO; if (is_bad_inode(inode)) - goto out; + return; - data.file = file; - data.inode = inode; - data.nr_pages = nr_pages; - data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages); -; - data.ia = fuse_io_alloc(NULL, data.max_pages); - err = -ENOMEM; - if (!data.ia) - goto out; + max_pages = min_t(unsigned int, fc->max_pages, + fc->max_read / PAGE_SIZE); - err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) { - if (data.ia->ap.num_pages) - fuse_send_readpages(data.ia, file); - else - fuse_io_free(data.ia); + for (;;) { + struct fuse_io_args *ia; + struct fuse_args_pages *ap; + + nr_pages = readahead_count(rac) - nr_pages; + if (nr_pages > max_pages) + nr_pages = max_pages; + if (nr_pages == 0) + break; + ia = fuse_io_alloc(NULL, nr_pages); + if (!ia) + return; + ap = &ia->ap; + nr_pages = __readahead_batch(rac, ap->pages, nr_pages); + for (i = 0; i < nr_pages; i++) { + fuse_wait_on_page_writeback(inode, + readahead_index(rac) + i); + ap->descs[i].length = PAGE_SIZE; + } + ap->num_pages = nr_pages; + fuse_send_readpages(ia, rac->file); } -out: - return err; } static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -3373,10 +3329,10 @@ static const struct file_operations fuse_file_operations = { static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, + .readahead = fuse_readahead, .writepage = fuse_writepage, .writepages = fuse_writepages, .launder_page = fuse_launder_page, - .readpages = fuse_readpages, .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 786c1ce8f030..72c9560f4467 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -577,7 +577,7 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, } /** - * gfs2_readpages - Read a bunch of pages at once + * gfs2_readahead - Read a bunch of pages at once * @file: The file to read from * @mapping: Address space info * @pages: List of pages to read @@ -590,31 +590,24 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, * obviously not something we'd want to do on too regular a basis. * Any I/O we ignore at this time will be done via readpage later. * 2. We don't handle stuffed files here we let readpage do the honours. - * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 3. mpage_readahead() does most of the heavy lifting in the common case. * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. */ -static int gfs2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void gfs2_readahead(struct readahead_control *rac) { - struct inode *inode = mapping->host; + struct inode *inode = rac->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_holder gh; - int ret; gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (unlikely(ret)) + if (gfs2_glock_nq(&gh)) goto out_uninit; if (!gfs2_is_stuffed(ip)) - ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + mpage_readahead(rac, gfs2_block_map); gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (unlikely(gfs2_withdrawn(sdp))) - ret = -EIO; - return ret; } /** @@ -833,7 +826,7 @@ static const struct address_space_operations gfs2_aops = { .writepage = gfs2_writepage, .writepages = gfs2_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, @@ -847,7 +840,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .writepage = gfs2_jdata_writepage, .writepages = gfs2_jdata_writepages, .readpage = gfs2_readpage, - .readpages = gfs2_readpages, + .readahead = gfs2_readahead, .set_page_dirty = jdata_set_page_dirty, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c3f7732415be..c0f2875c946c 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,7 +354,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); if (hc == NULL) - hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + hc = __vmalloc(hsize, GFP_NOFS); if (hc == NULL) return ERR_PTR(-ENOMEM); @@ -1166,7 +1166,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) hc2 = kmalloc_array(hsize_bytes, 2, GFP_NOFS | __GFP_NOWARN); if (hc2 == NULL) - hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS); if (!hc2) return -ENOMEM; @@ -1327,7 +1327,7 @@ static void *gfs2_alloc_sort_buffer(unsigned size) if (size < KMALLOC_MAX_SIZE) ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); if (!ptr) - ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + ptr = __vmalloc(size, GFP_NOFS); return ptr; } @@ -1987,8 +1987,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); if (ht == NULL) - ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO, - PAGE_KERNEL); + ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO); if (!ht) return -ENOMEM; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 48b54ec1c793..cb2a11b458c6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -509,12 +509,12 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, unsigned int bsize = sdp->sd_sb.sb_bsize, off; unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; unsigned int shift = PAGE_SHIFT - bsize_shift; - unsigned int max_bio_size = 2 * 1024 * 1024; + unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift; struct gfs2_journal_extent *je; int sz, ret = 0; struct bio *bio = NULL; struct page *page = NULL; - bool bio_chained = false, done = false; + bool done = false; errseq_t since; memset(head, 0, sizeof(*head)); @@ -537,10 +537,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, off = 0; } - if (!bio || (bio_chained && !off) || - bio->bi_iter.bi_size >= max_bio_size) { - /* start new bio */ - } else { + if (bio && (off || block < blocks_submitted + max_blocks)) { sector_t sector = dblock << sdp->sd_fsb2bb_shift; if (bio_end_sector(bio) == sector) { @@ -553,19 +550,17 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, (PAGE_SIZE - off) >> bsize_shift; bio = gfs2_chain_bio(bio, blocks); - bio_chained = true; goto add_block_to_new_bio; } } if (bio) { - blocks_submitted = block + 1; + blocks_submitted = block; submit_bio(bio); } bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read); bio->bi_opf = REQ_OP_READ; - bio_chained = false; add_block_to_new_bio: sz = bio_add_page(bio, page, bsize, off); BUG_ON(sz != bsize); @@ -573,7 +568,7 @@ block_added: off += bsize; if (off == PAGE_SIZE) page = NULL; - if (blocks_submitted < 2 * max_bio_size >> bsize_shift) { + if (blocks_submitted <= blocks_read + max_blocks) { /* Keep at least one bio in flight */ continue; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 8259fef3f986..4b67d47a7e00 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1365,7 +1365,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); if (sdp->sd_quota_bitmap == NULL) sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | - __GFP_ZERO, PAGE_KERNEL); + __GFP_ZERO); if (!sdp->sd_quota_bitmap) return error; diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index 44f6e89bcb75..129926b5142d 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig @@ -6,7 +6,7 @@ config HFS_FS help If you say Y here, you will be able to mount Macintosh-formatted floppy disks and hard drive partitions with full read-write access. - Please read <file:Documentation/filesystems/hfs.txt> to learn about + Please read <file:Documentation/filesystems/hfs.rst> to learn about the available mount options. To compile this file system support as a module, choose M here: the diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 460281b1299e..cdf0edeeb278 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -32,29 +32,35 @@ static int hfs_get_last_session(struct super_block *sb, sector_t *start, sector_t *size) { - struct cdrom_multisession ms_info; - struct cdrom_tocentry te; - int res; + struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); /* default values */ *start = 0; *size = i_size_read(sb->s_bdev->bd_inode) >> 9; if (HFS_SB(sb)->session >= 0) { + struct cdrom_tocentry te; + + if (!cdi) + return -EINVAL; + te.cdte_track = HFS_SB(sb)->session; te.cdte_format = CDROM_LBA; - res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); - if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { - *start = (sector_t)te.cdte_addr.lba << 2; - return 0; + if (cdrom_read_tocentry(cdi, &te) || + (te.cdte_ctrl & CDROM_DATA_TRACK) != 4) { + pr_err("invalid session number or type of track\n"); + return -EINVAL; } - pr_err("invalid session number or type of track\n"); - return -EINVAL; + + *start = (sector_t)te.cdte_addr.lba << 2; + } else if (cdi) { + struct cdrom_multisession ms_info; + + ms_info.addr_format = CDROM_LBA; + if (cdrom_multisession(cdi, &ms_info) == 0 && ms_info.xa_flag) + *start = (sector_t)ms_info.addr.lba << 2; } - ms_info.addr_format = CDROM_LBA; - res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, (unsigned long)&ms_info); - if (!res && ms_info.xa_flag) - *start = (sector_t)ms_info.addr.lba << 2; + return 0; } diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 94bd83b36644..e3da9e96b835 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -340,7 +340,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, } if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); inode_unlock(inode); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 2b9e5743105e..129dca3f4b78 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -239,7 +239,7 @@ out: mutex_unlock(&sbi->vh_mutex); if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) - blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(sb->s_bdev, GFP_KERNEL); return error; } diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 08c1580bdf7a..61eec628805d 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -127,31 +127,34 @@ static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) static int hfsplus_get_last_session(struct super_block *sb, sector_t *start, sector_t *size) { - struct cdrom_multisession ms_info; - struct cdrom_tocentry te; - int res; + struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); /* default values */ *start = 0; *size = i_size_read(sb->s_bdev->bd_inode) >> 9; if (HFSPLUS_SB(sb)->session >= 0) { + struct cdrom_tocentry te; + + if (!cdi) + return -EINVAL; + te.cdte_track = HFSPLUS_SB(sb)->session; te.cdte_format = CDROM_LBA; - res = ioctl_by_bdev(sb->s_bdev, - CDROMREADTOCENTRY, (unsigned long)&te); - if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { - *start = (sector_t)te.cdte_addr.lba << 2; - return 0; + if (cdrom_read_tocentry(cdi, &te) || + (te.cdte_ctrl & CDROM_DATA_TRACK) != 4) { + pr_err("invalid session number or type of track\n"); + return -EINVAL; } - pr_err("invalid session number or type of track\n"); - return -EINVAL; + *start = (sector_t)te.cdte_addr.lba << 2; + } else if (cdi) { + struct cdrom_multisession ms_info; + + ms_info.addr_format = CDROM_LBA; + if (cdrom_multisession(cdi, &ms_info) == 0 && ms_info.xa_flag) + *start = (sector_t)ms_info.addr.lba << 2; } - ms_info.addr_format = CDROM_LBA; - res = ioctl_by_bdev(sb->s_bdev, CDROMMULTISESSION, - (unsigned long)&ms_info); - if (!res && ms_info.xa_flag) - *start = (sector_t)ms_info.addr.lba << 2; + return 0; } diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index 56aa0336254a..2b36dc6f0a10 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -9,7 +9,7 @@ config HPFS_FS write files to an OS/2 HPFS partition on your hard drive. OS/2 floppies however are in regular MSDOS format, so you don't need this option in order to be able to read them. Read - <file:Documentation/filesystems/hpfs.txt>. + <file:Documentation/filesystems/hpfs.rst>. To compile this file system support as a module, choose M here: the module will be called hpfs. If unsure, say N. diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b36abf9cb345..2de0d3492d15 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -125,10 +125,9 @@ static int hpfs_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, hpfs_get_block, wbc); } -static int hpfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void hpfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, hpfs_get_block); + mpage_readahead(rac, hpfs_get_block); } static int hpfs_writepages(struct address_space *mapping, @@ -198,7 +197,7 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, const struct address_space_operations hpfs_aops = { .readpage = hpfs_readpage, .writepage = hpfs_writepage, - .readpages = hpfs_readpages, + .readahead = hpfs_readahead, .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..37226a9cfa4f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1606,14 +1606,14 @@ EXPORT_SYMBOL(iput); * @inode: inode owning the block number being requested * @block: pointer containing the block to find * - * Replaces the value in *block with the block number on the device holding + * Replaces the value in ``*block`` with the block number on the device holding * corresponding to the requested block number in the file. * That is, asked for block 4 of inode 1 the function will replace the - * 4 in *block, with disk block relative to the disk start that holds that + * 4 in ``*block``, with disk block relative to the disk start that holds that * block of the file. * * Returns -EINVAL in case of error, 0 otherwise. If mapping falls into a - * hole, returns 0 and *block is also set to 0. + * hole, returns 0 and ``*block`` is also set to 0. */ int bmap(struct inode *inode, sector_t *block) { diff --git a/fs/internal.h b/fs/internal.h index aa5d45524e87..b89d78f10396 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -126,7 +126,6 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); -long do_faccessat(int dfd, const char __user *filename, int mode); int do_fchmodat(int dfd, const char __user *filename, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); @@ -186,5 +185,5 @@ int sb_init_dio_done_wq(struct super_block *sb); /* * fs/stat.c: */ -unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags); -int cp_statx(const struct kstat *stat, struct statx __user *buffer); +int do_statx(int dfd, const char __user *filename, unsigned flags, + unsigned int mask, struct statx __user *buffer); diff --git a/fs/io_uring.c b/fs/io_uring.c index bb25e3997d41..9d4bd0d3a080 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -142,7 +142,7 @@ struct io_rings { */ u32 sq_dropped; /* - * Runtime flags + * Runtime SQ flags * * Written by the kernel, shouldn't be modified by the * application. @@ -152,6 +152,13 @@ struct io_rings { */ u32 sq_flags; /* + * Runtime CQ flags + * + * Written by the application, shouldn't be modified by the + * kernel. + */ + u32 cq_flags; + /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure * there are not more requests pending than there is space in @@ -191,7 +198,7 @@ struct fixed_file_ref_node { struct list_head node; struct list_head file_list; struct fixed_file_data *file_data; - struct work_struct work; + struct llist_node llist; }; struct fixed_file_data { @@ -279,8 +286,8 @@ struct io_ring_ctx { const struct cred *creds; - /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ - struct completion *completions; + struct completion ref_comp; + struct completion sq_thread_comp; /* if all else fails... */ struct io_kiocb *fallback_req; @@ -327,6 +334,9 @@ struct io_ring_ctx { struct list_head inflight_list; } ____cacheline_aligned_in_smp; + struct delayed_work file_put_work; + struct llist_head file_put_llist; + struct work_struct exit_work; }; @@ -384,7 +394,8 @@ struct io_timeout { struct file *file; u64 addr; int flags; - u32 count; + u32 off; + u32 target_seq; }; struct io_rw { @@ -415,11 +426,7 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; - union { - unsigned mask; - }; struct filename *filename; - struct statx __user *buffer; struct open_how how; unsigned long nofile; }; @@ -471,6 +478,15 @@ struct io_provide_buf { __u16 bid; }; +struct io_statx { + struct file *file; + int dfd; + unsigned int mask; + unsigned int flags; + const char __user *filename; + struct statx __user *buffer; +}; + struct io_async_connect { struct sockaddr_storage address; }; @@ -612,11 +628,11 @@ struct io_kiocb { struct io_epoll epoll; struct io_splice splice; struct io_provide_buf pbuf; + struct io_statx statx; }; struct io_async_ctx *io; int cflags; - bool needs_fixed_file; u8 opcode; u16 buf_index; @@ -788,7 +804,6 @@ static const struct io_op_def io_op_defs[] = { .needs_fs = 1, }, [IORING_OP_CLOSE] = { - .needs_file = 1, .file_table = 1, }, [IORING_OP_FILES_UPDATE] = { @@ -847,6 +862,11 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_PROVIDE_BUFFERS] = {}, [IORING_OP_REMOVE_BUFFERS] = {}, + [IORING_OP_TEE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + }, }; static void io_wq_submit_work(struct io_wq_work **workptr); @@ -882,11 +902,18 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static void io_file_put_work(struct work_struct *work); + +static inline bool io_async_submit(struct io_ring_ctx *ctx) +{ + return ctx->flags & IORING_SETUP_SQPOLL; +} + static void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); - complete(&ctx->completions[0]); + complete(&ctx->ref_comp); } static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) @@ -902,10 +929,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx->fallback_req) goto err; - ctx->completions = kmalloc(2 * sizeof(struct completion), GFP_KERNEL); - if (!ctx->completions) - goto err; - /* * Use 5 bits less than the max cq entries, that should give us around * 32 entries per hash list if totally full and uniformly spread. @@ -929,8 +952,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->sqo_wait); init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); - init_completion(&ctx->completions[0]); - init_completion(&ctx->completions[1]); + init_completion(&ctx->ref_comp); + init_completion(&ctx->sq_thread_comp); idr_init(&ctx->io_buffer_idr); idr_init(&ctx->personality_idr); mutex_init(&ctx->uring_lock); @@ -942,11 +965,12 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->inflight_wait); spin_lock_init(&ctx->inflight_lock); INIT_LIST_HEAD(&ctx->inflight_list); + INIT_DELAYED_WORK(&ctx->file_put_work, io_file_put_work); + init_llist_head(&ctx->file_put_llist); return ctx; err: if (ctx->fallback_req) kmem_cache_free(req_cachep, ctx->fallback_req); - kfree(ctx->completions); kfree(ctx->cancel_hash); kfree(ctx); return NULL; @@ -968,36 +992,6 @@ static inline bool req_need_defer(struct io_kiocb *req) return false; } -static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) -{ - struct io_kiocb *req; - - req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list); - if (req && !req_need_defer(req)) { - list_del_init(&req->list); - return req; - } - - return NULL; -} - -static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) -{ - struct io_kiocb *req; - - req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); - if (req) { - if (req->flags & REQ_F_TIMEOUT_NOSEQ) - return NULL; - if (!__req_need_defer(req)) { - list_del_init(&req->list); - return req; - } - } - - return NULL; -} - static void __io_commit_cqring(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; @@ -1113,17 +1107,43 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->completion_lock); } -static void io_commit_cqring(struct io_ring_ctx *ctx) +static void __io_queue_deferred(struct io_ring_ctx *ctx) { - struct io_kiocb *req; + do { + struct io_kiocb *req = list_first_entry(&ctx->defer_list, + struct io_kiocb, list); - while ((req = io_get_timeout_req(ctx)) != NULL) + if (req_need_defer(req)) + break; + list_del_init(&req->list); + io_queue_async_work(req); + } while (!list_empty(&ctx->defer_list)); +} + +static void io_flush_timeouts(struct io_ring_ctx *ctx) +{ + while (!list_empty(&ctx->timeout_list)) { + struct io_kiocb *req = list_first_entry(&ctx->timeout_list, + struct io_kiocb, list); + + if (req->flags & REQ_F_TIMEOUT_NOSEQ) + break; + if (req->timeout.target_seq != ctx->cached_cq_tail + - atomic_read(&ctx->cq_timeouts)) + break; + + list_del_init(&req->list); io_kill_timeout(req); + } +} +static void io_commit_cqring(struct io_ring_ctx *ctx) +{ + io_flush_timeouts(ctx); __io_commit_cqring(ctx); - while ((req = io_get_deferred_req(ctx)) != NULL) - io_queue_async_work(req); + if (unlikely(!list_empty(&ctx->defer_list))) + __io_queue_deferred(ctx); } static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) @@ -1148,6 +1168,8 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) { if (!ctx->cq_ev_fd) return false; + if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) + return false; if (!ctx->eventfd_async) return true; return io_wq_current_is_worker(); @@ -1984,15 +2006,19 @@ static void io_iopoll_req_issued(struct io_kiocb *req) wake_up(&ctx->sqo_wait); } -static void io_file_put(struct io_submit_state *state) +static void __io_state_file_put(struct io_submit_state *state) { - if (state->file) { - int diff = state->has_refs - state->used_refs; + int diff = state->has_refs - state->used_refs; - if (diff) - fput_many(state->file, diff); - state->file = NULL; - } + if (diff) + fput_many(state->file, diff); + state->file = NULL; +} + +static inline void io_state_file_put(struct io_submit_state *state) +{ + if (state->file) + __io_state_file_put(state); } /* @@ -2011,7 +2037,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) state->ios_left--; return state->file; } - io_file_put(state); + __io_state_file_put(state); } state->file = fget_many(fd, state->ios_left); if (!state->file) @@ -2727,7 +2753,8 @@ out_free: return ret; } -static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int __io_splice_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { struct io_splice* sp = &req->splice; unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; @@ -2737,8 +2764,6 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; sp->file_in = NULL; - sp->off_in = READ_ONCE(sqe->splice_off_in); - sp->off_out = READ_ONCE(sqe->off); sp->len = READ_ONCE(sqe->len); sp->flags = READ_ONCE(sqe->splice_flags); @@ -2757,6 +2782,46 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static int io_tee_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) + return -EINVAL; + return __io_splice_prep(req, sqe); +} + +static int io_tee(struct io_kiocb *req, bool force_nonblock) +{ + struct io_splice *sp = &req->splice; + struct file *in = sp->file_in; + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + long ret = 0; + + if (force_nonblock) + return -EAGAIN; + if (sp->len) + ret = do_tee(in, out, sp->len, flags); + + io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); + req->flags &= ~REQ_F_NEED_CLEANUP; + + io_cqring_add_event(req, ret); + if (ret != sp->len) + req_set_fail_links(req); + io_put_req(req); + return 0; +} + +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_splice* sp = &req->splice; + + sp->off_in = READ_ONCE(sqe->splice_off_in); + sp->off_out = READ_ONCE(sqe->off); + return __io_splice_prep(req, sqe); +} + static int io_splice(struct io_kiocb *req, bool force_nonblock) { struct io_splice *sp = &req->splice; @@ -3305,43 +3370,23 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock) static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - const char __user *fname; - unsigned lookup_flags; - int ret; - if (sqe->ioprio || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; - if (req->flags & REQ_F_NEED_CLEANUP) - return 0; - - req->open.dfd = READ_ONCE(sqe->fd); - req->open.mask = READ_ONCE(sqe->len); - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - req->open.how.flags = READ_ONCE(sqe->statx_flags); - - if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags)) - return -EINVAL; - req->open.filename = getname_flags(fname, lookup_flags, NULL); - if (IS_ERR(req->open.filename)) { - ret = PTR_ERR(req->open.filename); - req->open.filename = NULL; - return ret; - } + req->statx.dfd = READ_ONCE(sqe->fd); + req->statx.mask = READ_ONCE(sqe->len); + req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr)); + req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + req->statx.flags = READ_ONCE(sqe->statx_flags); - req->flags |= REQ_F_NEED_CLEANUP; return 0; } static int io_statx(struct io_kiocb *req, bool force_nonblock) { - struct io_open *ctx = &req->open; - unsigned lookup_flags; - struct path path; - struct kstat stat; + struct io_statx *ctx = &req->statx; int ret; if (force_nonblock) { @@ -3351,29 +3396,9 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock) return -EAGAIN; } - if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags)) - return -EINVAL; - -retry: - /* filename_lookup() drops it, keep a reference */ - ctx->filename->refcnt++; + ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, + ctx->buffer); - ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path, - NULL); - if (ret) - goto err; - - ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags); - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - if (!ret) - ret = cp_statx(&stat, ctx->buffer); -err: - putname(ctx->filename); - req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); @@ -3396,10 +3421,6 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); - if (req->file->f_op == &io_uring_fops || - req->close.fd == req->ctx->ring_fd) - return -EBADF; - return 0; } @@ -3432,21 +3453,14 @@ static int io_close(struct io_kiocb *req, bool force_nonblock) req->close.put_file = NULL; ret = __close_fd_get_file(req->close.fd, &req->close.put_file); if (ret < 0) - return ret; + return (ret == -ENOENT) ? -EBADF : ret; /* if the file has a flush method, be safe and punt to async */ if (req->close.put_file->f_op->flush && force_nonblock) { - /* submission ref will be dropped, take it for async */ - refcount_inc(&req->refs); - + /* avoid grabbing files - we don't need the files */ + req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT; req->work.func = io_close_finish; - /* - * Do manual async queue here to avoid grabbing files - we don't - * need the files, and it'll cause io_close_finish() to close - * the file again and cause a double CQE entry for this request - */ - io_queue_async_work(req); - return 0; + return -EAGAIN; } /* @@ -4096,27 +4110,6 @@ struct io_poll_table { int error; }; -static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, - struct wait_queue_head *head) -{ - if (unlikely(poll->head)) { - pt->error = -EINVAL; - return; - } - - pt->error = 0; - poll->head = head; - add_wait_queue(head, &poll->wait); -} - -static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - - __io_queue_proc(&pt->req->apoll->poll, pt, head); -} - static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { @@ -4170,12 +4163,150 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) return false; } +static void io_poll_remove_double(struct io_kiocb *req) +{ + struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io; + + lockdep_assert_held(&req->ctx->completion_lock); + + if (poll && poll->head) { + struct wait_queue_head *head = poll->head; + + spin_lock(&head->lock); + list_del_init(&poll->wait.entry); + if (poll->wait.private) + refcount_dec(&req->refs); + poll->head = NULL; + spin_unlock(&head->lock); + } +} + +static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) +{ + struct io_ring_ctx *ctx = req->ctx; + + io_poll_remove_double(req); + req->poll.done = true; + io_cqring_fill_event(req, error ? error : mangle_poll(mask)); + io_commit_cqring(ctx); +} + +static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (io_poll_rewait(req, &req->poll)) { + spin_unlock_irq(&ctx->completion_lock); + return; + } + + hash_del(&req->hash_node); + io_poll_complete(req, req->result, 0); + req->flags |= REQ_F_COMP_LOCKED; + io_put_req_find_next(req, nxt); + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_ev_posted(ctx); +} + +static void io_poll_task_func(struct callback_head *cb) +{ + struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_kiocb *nxt = NULL; + + io_poll_task_handler(req, &nxt); + if (nxt) { + struct io_ring_ctx *ctx = nxt->ctx; + + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(nxt, NULL); + mutex_unlock(&ctx->uring_lock); + } +} + +static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, + int sync, void *key) +{ + struct io_kiocb *req = wait->private; + struct io_poll_iocb *poll = (struct io_poll_iocb *) req->io; + __poll_t mask = key_to_poll(key); + + /* for instances that support it check for an event match first: */ + if (mask && !(mask & poll->events)) + return 0; + + if (req->poll.head) { + bool done; + + spin_lock(&req->poll.head->lock); + done = list_empty(&req->poll.wait.entry); + if (!done) + list_del_init(&req->poll.wait.entry); + spin_unlock(&req->poll.head->lock); + if (!done) + __io_async_wake(req, poll, mask, io_poll_task_func); + } + refcount_dec(&req->refs); + return 1; +} + +static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, + wait_queue_func_t wake_func) +{ + poll->head = NULL; + poll->done = false; + poll->canceled = false; + poll->events = events; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); +} + +static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, + struct wait_queue_head *head) +{ + struct io_kiocb *req = pt->req; + + /* + * If poll->head is already set, it's because the file being polled + * uses multiple waitqueues for poll handling (eg one for read, one + * for write). Setup a separate io_poll_iocb if this happens. + */ + if (unlikely(poll->head)) { + /* already have a 2nd entry, fail a third attempt */ + if (req->io) { + pt->error = -EINVAL; + return; + } + poll = kmalloc(sizeof(*poll), GFP_ATOMIC); + if (!poll) { + pt->error = -ENOMEM; + return; + } + io_init_poll_iocb(poll, req->poll.events, io_poll_double_wake); + refcount_inc(&req->refs); + poll->wait.private = req; + req->io = (void *) poll; + } + + pt->error = 0; + poll->head = head; + add_wait_queue(head, &poll->wait); +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + + __io_queue_proc(&pt->req->apoll->poll, pt, head); +} + static void io_async_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; - bool canceled; + bool canceled = false; trace_io_uring_task_run(req->ctx, req->opcode, req->user_data); @@ -4184,34 +4315,33 @@ static void io_async_task_func(struct callback_head *cb) return; } - if (hash_hashed(&req->hash_node)) + /* If req is still hashed, it cannot have been canceled. Don't check. */ + if (hash_hashed(&req->hash_node)) { hash_del(&req->hash_node); - - canceled = READ_ONCE(apoll->poll.canceled); - if (canceled) { - io_cqring_fill_event(req, -ECANCELED); - io_commit_cqring(ctx); + } else { + canceled = READ_ONCE(apoll->poll.canceled); + if (canceled) { + io_cqring_fill_event(req, -ECANCELED); + io_commit_cqring(ctx); + } } spin_unlock_irq(&ctx->completion_lock); /* restore ->work in case we need to retry again */ memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll); - if (canceled) { - kfree(apoll); + if (!canceled) { + __set_current_state(TASK_RUNNING); + mutex_lock(&ctx->uring_lock); + __io_queue_sqe(req, NULL); + mutex_unlock(&ctx->uring_lock); + } else { io_cqring_ev_posted(ctx); req_set_fail_links(req); io_double_put_req(req); - return; } - - __set_current_state(TASK_RUNNING); - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(req, NULL); - mutex_unlock(&ctx->uring_lock); - - kfree(apoll); } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -4245,18 +4375,13 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, bool cancel = false; poll->file = req->file; - poll->head = NULL; - poll->done = poll->canceled = false; - poll->events = mask; + io_init_poll_iocb(poll, mask, wake_func); + poll->wait.private = req; ipt->pt._key = mask; ipt->req = req; ipt->error = -EINVAL; - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, wake_func); - poll->wait.private = req; - mask = vfs_poll(req->file, &ipt->pt) & poll->events; spin_lock_irq(&ctx->completion_lock); @@ -4287,6 +4412,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask, ret; + bool had_io; if (!req->file || !file_can_poll(req->file)) return false; @@ -4301,6 +4427,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) req->flags |= REQ_F_POLLED; memcpy(&apoll->work, &req->work, sizeof(req->work)); + had_io = req->io != NULL; get_task_struct(current); req->task = current; @@ -4320,7 +4447,9 @@ static bool io_arm_poll_handler(struct io_kiocb *req) io_async_wake); if (ret) { ipt.error = 0; - apoll->poll.done = true; + /* only remove double add if we did it here */ + if (!had_io) + io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); memcpy(&req->work, &apoll->work, sizeof(req->work)); kfree(apoll); @@ -4344,32 +4473,32 @@ static bool __io_poll_remove_one(struct io_kiocb *req, do_complete = true; } spin_unlock(&poll->head->lock); + hash_del(&req->hash_node); return do_complete; } static bool io_poll_remove_one(struct io_kiocb *req) { - struct async_poll *apoll = NULL; bool do_complete; if (req->opcode == IORING_OP_POLL_ADD) { + io_poll_remove_double(req); do_complete = __io_poll_remove_one(req, &req->poll); } else { - apoll = req->apoll; + struct async_poll *apoll = req->apoll; + /* non-poll requests have submit ref still */ - do_complete = __io_poll_remove_one(req, &req->apoll->poll); - if (do_complete) + do_complete = __io_poll_remove_one(req, &apoll->poll); + if (do_complete) { io_put_req(req); - } - - hash_del(&req->hash_node); - - if (do_complete && apoll) { - /* - * restore ->work because we need to call io_req_work_drop_env. - */ - memcpy(&req->work, &apoll->work, sizeof(req->work)); - kfree(apoll); + /* + * restore ->work because we will call + * io_req_work_drop_env below when dropping the + * final reference. + */ + memcpy(&req->work, &apoll->work, sizeof(req->work)); + kfree(apoll); + } } if (do_complete) { @@ -4454,49 +4583,6 @@ static int io_poll_remove(struct io_kiocb *req) return 0; } -static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) -{ - struct io_ring_ctx *ctx = req->ctx; - - req->poll.done = true; - io_cqring_fill_event(req, error ? error : mangle_poll(mask)); - io_commit_cqring(ctx); -} - -static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_poll_iocb *poll = &req->poll; - - if (io_poll_rewait(req, poll)) { - spin_unlock_irq(&ctx->completion_lock); - return; - } - - hash_del(&req->hash_node); - io_poll_complete(req, req->result, 0); - req->flags |= REQ_F_COMP_LOCKED; - io_put_req_find_next(req, nxt); - spin_unlock_irq(&ctx->completion_lock); - - io_cqring_ev_posted(ctx); -} - -static void io_poll_task_func(struct callback_head *cb) -{ - struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); - struct io_kiocb *nxt = NULL; - - io_poll_task_handler(req, &nxt); - if (nxt) { - struct io_ring_ctx *ctx = nxt->ctx; - - mutex_lock(&ctx->uring_lock); - __io_queue_sqe(nxt, NULL); - mutex_unlock(&ctx->uring_lock); - } -} - static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { @@ -4576,20 +4662,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) * We could be racing with timeout deletion. If the list is empty, * then timeout lookup already found it and will be handling it. */ - if (!list_empty(&req->list)) { - struct io_kiocb *prev; - - /* - * Adjust the reqs sequence before the current one because it - * will consume a slot in the cq_ring and the cq_tail - * pointer will be increased, otherwise other timeout reqs may - * return in advance without waiting for enough wait_nr. - */ - prev = req; - list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list) - prev->sequence++; + if (!list_empty(&req->list)) list_del_init(&req->list); - } io_cqring_fill_event(req, -ETIME); io_commit_cqring(ctx); @@ -4669,18 +4743,19 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, { struct io_timeout_data *data; unsigned flags; + u32 off = READ_ONCE(sqe->off); if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->buf_index || sqe->len != 1) return -EINVAL; - if (sqe->off && is_timeout_link) + if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - req->timeout.count = READ_ONCE(sqe->off); + req->timeout.off = off; if (!req->io && io_alloc_async_ctx(req)) return -ENOMEM; @@ -4704,68 +4779,39 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, static int io_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_timeout_data *data; + struct io_timeout_data *data = &req->io->timeout; struct list_head *entry; - unsigned span = 0; - u32 count = req->timeout.count; - u32 seq = req->sequence; + u32 tail, off = req->timeout.off; - data = &req->io->timeout; + spin_lock_irq(&ctx->completion_lock); /* * sqe->off holds how many events that need to occur for this * timeout event to be satisfied. If it isn't set, then this is * a pure timeout request, sequence isn't used. */ - if (!count) { + if (!off) { req->flags |= REQ_F_TIMEOUT_NOSEQ; - spin_lock_irq(&ctx->completion_lock); entry = ctx->timeout_list.prev; goto add; } - req->sequence = seq + count; + tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + req->timeout.target_seq = tail + off; /* * Insertion sort, ensuring the first entry in the list is always * the one we need first. */ - spin_lock_irq(&ctx->completion_lock); list_for_each_prev(entry, &ctx->timeout_list) { struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); - unsigned nxt_seq; - long long tmp, tmp_nxt; - u32 nxt_offset = nxt->timeout.count; if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) continue; - - /* - * Since seq + count can overflow, use type long - * long to store it. - */ - tmp = (long long)seq + count; - nxt_seq = nxt->sequence - nxt_offset; - tmp_nxt = (long long)nxt_seq + nxt_offset; - - /* - * cached_sq_head may overflow, and it will never overflow twice - * once there is some timeout req still be valid. - */ - if (seq < nxt_seq) - tmp += UINT_MAX; - - if (tmp > tmp_nxt) + /* nxt.seq is behind @tail, otherwise would've been completed */ + if (off >= nxt->timeout.target_seq - tail) break; - - /* - * Sequence of reqs after the insert one and itself should - * be adjusted because each timeout req consumes a slot. - */ - span++; - nxt->sequence++; } - req->sequence -= span; add: list_add(&req->list, entry); data->timer.function = io_timeout_fn; @@ -4994,6 +5040,9 @@ static int io_req_defer_prep(struct io_kiocb *req, case IORING_OP_REMOVE_BUFFERS: ret = io_remove_buffers_prep(req, sqe); break; + case IORING_OP_TEE: + ret = io_tee_prep(req, sqe); + break; default: printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", req->opcode); @@ -5064,10 +5113,9 @@ static void io_cleanup_req(struct io_kiocb *req) break; case IORING_OP_OPENAT: case IORING_OP_OPENAT2: - case IORING_OP_STATX: - putname(req->open.filename); break; case IORING_OP_SPLICE: + case IORING_OP_TEE: io_put_file(req, req->splice.file_in, (req->splice.flags & SPLICE_F_FD_IN_FIXED)); break; @@ -5298,6 +5346,14 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, } ret = io_remove_buffers(req, force_nonblock); break; + case IORING_OP_TEE: + if (sqe) { + ret = io_tee_prep(req, sqe); + if (ret < 0) + break; + } + ret = io_tee(req, force_nonblock); + break; default: ret = -EINVAL; break; @@ -5367,7 +5423,7 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, struct fixed_file_table *table; table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT]; - return table->files[index & IORING_FILE_TABLE_MASK];; + return table->files[index & IORING_FILE_TABLE_MASK]; } static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, @@ -5403,7 +5459,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, bool fixed; fixed = (req->flags & REQ_F_FIXED_FILE) != 0; - if (unlikely(!fixed && req->needs_fixed_file)) + if (unlikely(!fixed && io_async_submit(req->ctx))) return -EBADF; return io_file_get(state, req, fd, &req->file, fixed); @@ -5638,7 +5694,7 @@ static inline void io_queue_link_head(struct io_kiocb *req) } static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_submit_state *state, struct io_kiocb **link) + struct io_kiocb **link) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5711,7 +5767,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, static void io_submit_state_end(struct io_submit_state *state) { blk_finish_plug(&state->plug); - io_file_put(state); + io_state_file_put(state); if (state->free_reqs) kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); } @@ -5782,7 +5838,7 @@ static inline void io_consume_sqe(struct io_ring_ctx *ctx) static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_submit_state *state, bool async) + struct io_submit_state *state) { unsigned int sqe_flags; int id; @@ -5803,7 +5859,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, refcount_set(&req->refs, 2); req->task = NULL; req->result = 0; - req->needs_fixed_file = async; INIT_IO_WORK(&req->work, io_wq_submit_work); if (unlikely(req->opcode >= IORING_OP_LAST)) @@ -5833,9 +5888,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags |= sqe_flags & (IOSQE_IO_DRAIN | IOSQE_IO_HARDLINK | - IOSQE_ASYNC | IOSQE_FIXED_FILE | - IOSQE_BUFFER_SELECT | IOSQE_IO_LINK); + req->flags |= sqe_flags; if (!io_op_defs[req->opcode].needs_file) return 0; @@ -5844,7 +5897,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, - struct file *ring_file, int ring_fd, bool async) + struct file *ring_file, int ring_fd) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; @@ -5888,7 +5941,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } - err = io_init_req(ctx, req, sqe, statep, async); + err = io_init_req(ctx, req, sqe, statep); io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; @@ -5901,8 +5954,8 @@ fail_req: } trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, - true, async); - err = io_submit_sqe(req, sqe, statep, &link); + true, io_async_submit(ctx)); + err = io_submit_sqe(req, sqe, &link); if (err) goto fail_req; } @@ -5942,7 +5995,7 @@ static int io_sq_thread(void *data) unsigned long timeout; int ret = 0; - complete(&ctx->completions[1]); + complete(&ctx->sq_thread_comp); old_fs = get_fs(); set_fs(USER_DS); @@ -6041,7 +6094,8 @@ static int io_sq_thread(void *data) } mutex_lock(&ctx->uring_lock); - ret = io_submit_sqes(ctx, to_submit, NULL, -1, true); + if (likely(!percpu_ref_is_dying(&ctx->refs))) + ret = io_submit_sqes(ctx, to_submit, NULL, -1); mutex_unlock(&ctx->uring_lock); timeout = jiffies + ctx->sq_thread_idle; } @@ -6189,22 +6243,22 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) struct fixed_file_data *data = ctx->file_data; struct fixed_file_ref_node *ref_node = NULL; unsigned nr_tables, i; - unsigned long flags; if (!data) return -ENXIO; - spin_lock_irqsave(&data->lock, flags); + spin_lock(&data->lock); if (!list_empty(&data->ref_list)) ref_node = list_first_entry(&data->ref_list, struct fixed_file_ref_node, node); - spin_unlock_irqrestore(&data->lock, flags); + spin_unlock(&data->lock); if (ref_node) percpu_ref_kill(&ref_node->refs); percpu_ref_kill(&data->refs); /* wait for all refs nodes to complete */ + flush_delayed_work(&ctx->file_put_work); wait_for_completion(&data->done); __io_sqe_files_unregister(ctx); @@ -6222,7 +6276,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) static void io_sq_thread_stop(struct io_ring_ctx *ctx) { if (ctx->sqo_thread) { - wait_for_completion(&ctx->completions[1]); + wait_for_completion(&ctx->sq_thread_comp); /* * The park is a bit of a work-around, without it we get * warning spews on shutdown with SQPOLL set and affinity @@ -6435,40 +6489,63 @@ struct io_file_put { struct file *file; }; -static void io_file_put_work(struct work_struct *work) +static void __io_file_put_work(struct fixed_file_ref_node *ref_node) { - struct fixed_file_ref_node *ref_node; - struct fixed_file_data *file_data; - struct io_ring_ctx *ctx; + struct fixed_file_data *file_data = ref_node->file_data; + struct io_ring_ctx *ctx = file_data->ctx; struct io_file_put *pfile, *tmp; - unsigned long flags; - - ref_node = container_of(work, struct fixed_file_ref_node, work); - file_data = ref_node->file_data; - ctx = file_data->ctx; list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) { - list_del_init(&pfile->list); + list_del(&pfile->list); io_ring_file_put(ctx, pfile->file); kfree(pfile); } - spin_lock_irqsave(&file_data->lock, flags); - list_del_init(&ref_node->node); - spin_unlock_irqrestore(&file_data->lock, flags); + spin_lock(&file_data->lock); + list_del(&ref_node->node); + spin_unlock(&file_data->lock); percpu_ref_exit(&ref_node->refs); kfree(ref_node); percpu_ref_put(&file_data->refs); } +static void io_file_put_work(struct work_struct *work) +{ + struct io_ring_ctx *ctx; + struct llist_node *node; + + ctx = container_of(work, struct io_ring_ctx, file_put_work.work); + node = llist_del_all(&ctx->file_put_llist); + + while (node) { + struct fixed_file_ref_node *ref_node; + struct llist_node *next = node->next; + + ref_node = llist_entry(node, struct fixed_file_ref_node, llist); + __io_file_put_work(ref_node); + node = next; + } +} + static void io_file_data_ref_zero(struct percpu_ref *ref) { struct fixed_file_ref_node *ref_node; + struct io_ring_ctx *ctx; + bool first_add; + int delay = HZ; ref_node = container_of(ref, struct fixed_file_ref_node, refs); + ctx = ref_node->file_data->ctx; + + if (percpu_ref_is_dying(&ctx->file_data->refs)) + delay = 0; - queue_work(system_wq, &ref_node->work); + first_add = llist_add(&ref_node->llist, &ctx->file_put_llist); + if (!delay) + mod_delayed_work(system_wq, &ctx->file_put_work, 0); + else if (first_add) + queue_delayed_work(system_wq, &ctx->file_put_work, delay); } static struct fixed_file_ref_node *alloc_fixed_file_ref_node( @@ -6487,10 +6564,8 @@ static struct fixed_file_ref_node *alloc_fixed_file_ref_node( } INIT_LIST_HEAD(&ref_node->node); INIT_LIST_HEAD(&ref_node->file_list); - INIT_WORK(&ref_node->work, io_file_put_work); ref_node->file_data = ctx->file_data; return ref_node; - } static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node) @@ -6508,7 +6583,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, int fd, ret = 0; unsigned i; struct fixed_file_ref_node *ref_node; - unsigned long flags; if (ctx->file_data) return -EBUSY; @@ -6616,9 +6690,9 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, } ctx->file_data->cur_refs = &ref_node->refs; - spin_lock_irqsave(&ctx->file_data->lock, flags); + spin_lock(&ctx->file_data->lock); list_add(&ref_node->node, &ctx->file_data->ref_list); - spin_unlock_irqrestore(&ctx->file_data->lock, flags); + spin_unlock(&ctx->file_data->lock); percpu_ref_get(&ctx->file_data->refs); return ret; } @@ -6694,7 +6768,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, __s32 __user *fds; int fd, i, err; __u32 done; - unsigned long flags; bool needs_switch = false; if (check_add_overflow(up->offset, nr_args, &done)) @@ -6759,10 +6832,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (needs_switch) { percpu_ref_kill(data->cur_refs); - spin_lock_irqsave(&data->lock, flags); + spin_lock(&data->lock); list_add(&ref_node->node, &data->ref_list); data->cur_refs = &ref_node->refs; - spin_unlock_irqrestore(&data->lock, flags); + spin_unlock(&data->lock); percpu_ref_get(&ctx->file_data->refs); } else destroy_fixed_file_ref_node(ref_node); @@ -7250,7 +7323,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) ring_pages(ctx->sq_entries, ctx->cq_entries)); free_uid(ctx->user); put_cred(ctx->creds); - kfree(ctx->completions); kfree(ctx->cancel_hash); kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx); @@ -7302,7 +7374,7 @@ static void io_ring_exit_work(struct work_struct *work) if (ctx->rings) io_cqring_overflow_flush(ctx, true); - wait_for_completion(&ctx->completions[0]); + wait_for_completion(&ctx->ref_comp); io_ring_ctx_free(ctx); } @@ -7312,16 +7384,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); - /* - * Wait for sq thread to idle, if we have one. It won't spin on new - * work after we've killed the ctx ref above. This is important to do - * before we cancel existing commands, as the thread could otherwise - * be queueing new work post that. If that's work we need to cancel, - * it could cause shutdown to hang. - */ - while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait)) - cond_resched(); - io_kill_timeouts(ctx); io_poll_remove_all(ctx); @@ -7390,14 +7452,15 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, * all we had, then we're done with this request. */ if (refcount_sub_and_test(2, &cancel_req->refs)) { - io_put_req(cancel_req); + io_free_req(cancel_req); finish_wait(&ctx->inflight_wait, &wait); continue; } + } else { + io_wq_cancel_work(ctx->io_wq, &cancel_req->work); + io_put_req(cancel_req); } - io_wq_cancel_work(ctx->io_wq, &cancel_req->work); - io_put_req(cancel_req); schedule(); finish_wait(&ctx->inflight_wait, &wait); } @@ -7530,7 +7593,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, submitted = to_submit; } else if (to_submit) { mutex_lock(&ctx->uring_lock); - submitted = io_submit_sqes(ctx, to_submit, f.file, fd, false); + submitted = io_submit_sqes(ctx, to_submit, f.file, fd); mutex_unlock(&ctx->uring_lock); if (submitted != to_submit) @@ -7841,6 +7904,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->cq_off.flags = offsetof(struct io_rings, cq_flags); p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | @@ -8001,7 +8065,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, * after we've killed the percpu ref. */ mutex_unlock(&ctx->uring_lock); - ret = wait_for_completion_interruptible(&ctx->completions[0]); + ret = wait_for_completion_interruptible(&ctx->ref_comp); mutex_lock(&ctx->uring_lock); if (ret) { percpu_ref_resurrect(&ctx->refs); @@ -8078,7 +8142,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, /* bring the ctx back to life */ percpu_ref_reinit(&ctx->refs); out: - reinit_completion(&ctx->completions[0]); + reinit_completion(&ctx->ref_comp); } return ret; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 89e21961d1ad..a1ed7620fbac 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -59,24 +59,19 @@ iomap_page_create(struct inode *inode, struct page *page) * migrate_page_move_mapping() assumes that pages with private data have * their count elevated by 1. */ - get_page(page); - set_page_private(page, (unsigned long)iop); - SetPagePrivate(page); + attach_page_private(page, iop); return iop; } static void iomap_page_release(struct page *page) { - struct iomap_page *iop = to_iomap_page(page); + struct iomap_page *iop = detach_page_private(page); if (!iop) return; WARN_ON_ONCE(atomic_read(&iop->read_count)); WARN_ON_ONCE(atomic_read(&iop->write_count)); - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); kfree(iop); } @@ -214,9 +209,8 @@ iomap_read_end_io(struct bio *bio) struct iomap_readpage_ctx { struct page *cur_page; bool cur_page_in_bio; - bool is_readahead; struct bio *bio; - struct list_head *pages; + struct readahead_control *rac; }; static void @@ -308,7 +302,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (ctx->bio) submit_bio(ctx->bio); - if (ctx->is_readahead) /* same as readahead_gfp_mask */ + if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); /* @@ -319,7 +313,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (!ctx->bio) ctx->bio = bio_alloc(orig_gfp, 1); ctx->bio->bi_opf = REQ_OP_READ; - if (ctx->is_readahead) + if (ctx->rac) ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; bio_set_dev(ctx->bio, iomap->bdev); @@ -367,7 +361,7 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } /* - * Just like mpage_readpages and block_read_full_page we always + * Just like mpage_readahead and block_read_full_page we always * return 0 and just mark the page as PageError on errors. This * should be cleaned up all through the stack eventually. */ @@ -375,36 +369,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_readpage); -static struct page * -iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, - loff_t length, loff_t *done) -{ - while (!list_empty(pages)) { - struct page *page = lru_to_page(pages); - - if (page_offset(page) >= (u64)pos + length) - break; - - list_del(&page->lru); - if (!add_to_page_cache_lru(page, inode->i_mapping, page->index, - GFP_NOFS)) - return page; - - /* - * If we already have a page in the page cache at index we are - * done. Upper layers don't care if it is uptodate after the - * readpages call itself as every page gets checked again once - * actually needed. - */ - *done += PAGE_SIZE; - put_page(page); - } - - return NULL; -} - static loff_t -iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, +iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_readpage_ctx *ctx = data; @@ -418,10 +384,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, ctx->cur_page = NULL; } if (!ctx->cur_page) { - ctx->cur_page = iomap_next_page(inode, ctx->pages, - pos, length, &done); - if (!ctx->cur_page) - break; + ctx->cur_page = readahead_page(ctx->rac); ctx->cur_page_in_bio = false; } ret = iomap_readpage_actor(inode, pos + done, length - done, @@ -431,32 +394,43 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, return done; } -int -iomap_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, const struct iomap_ops *ops) +/** + * iomap_readahead - Attempt to read pages from a file. + * @rac: Describes the pages to be read. + * @ops: The operations vector for the filesystem. + * + * This function is for filesystems to call to implement their readahead + * address_space operation. + * + * Context: The @ops callbacks may submit I/O (eg to read the addresses of + * blocks from disc), and may wait for it. The caller may be trying to + * access a different page, and so sleeping excessively should be avoided. + * It may allocate memory, but should avoid costly allocations. This + * function is called with memalloc_nofs set, so allocations will not cause + * the filesystem to be reentered. + */ +void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { + struct inode *inode = rac->mapping->host; + loff_t pos = readahead_pos(rac); + loff_t length = readahead_length(rac); struct iomap_readpage_ctx ctx = { - .pages = pages, - .is_readahead = true, + .rac = rac, }; - loff_t pos = page_offset(list_entry(pages->prev, struct page, lru)); - loff_t last = page_offset(list_entry(pages->next, struct page, lru)); - loff_t length = last - pos + PAGE_SIZE, ret = 0; - trace_iomap_readpages(mapping->host, nr_pages); + trace_iomap_readahead(inode, readahead_count(rac)); while (length > 0) { - ret = iomap_apply(mapping->host, pos, length, 0, ops, - &ctx, iomap_readpages_actor); + loff_t ret = iomap_apply(inode, pos, length, 0, ops, + &ctx, iomap_readahead_actor); if (ret <= 0) { WARN_ON_ONCE(ret == 0); - goto done; + break; } pos += ret; length -= ret; } - ret = 0; -done: + if (ctx.bio) submit_bio(ctx.bio); if (ctx.cur_page) { @@ -464,15 +438,8 @@ done: unlock_page(ctx.cur_page); put_page(ctx.cur_page); } - - /* - * Check that we didn't lose a page due to the arcance calling - * conventions.. - */ - WARN_ON_ONCE(!ret && !list_empty(ctx.pages)); - return ret; } -EXPORT_SYMBOL_GPL(iomap_readpages); +EXPORT_SYMBOL_GPL(iomap_readahead); /* * iomap_is_partially_uptodate checks whether blocks within a page are @@ -554,14 +521,8 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) { - ClearPagePrivate(page); - get_page(newpage); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - SetPagePrivate(newpage); - } + if (page_has_private(page)) + attach_page_private(newpage, detach_page_private(page)); if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index e4addfc58107..ec7b78e6feca 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -564,7 +564,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, !dio->submit.last_queue || !blk_poll(dio->submit.last_queue, dio->submit.cookie, true)) - io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); } diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4df19c66f597..5693a39d52fb 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -39,7 +39,7 @@ DEFINE_EVENT(iomap_readpage_class, name, \ TP_PROTO(struct inode *inode, int nr_pages), \ TP_ARGS(inode, nr_pages)) DEFINE_READPAGE_EVENT(iomap_readpage); -DEFINE_READPAGE_EVENT(iomap_readpages); +DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig index 5e7419599f50..08ffd37b9bb8 100644 --- a/fs/isofs/Kconfig +++ b/fs/isofs/Kconfig @@ -8,7 +8,7 @@ config ISO9660_FS long Unix filenames and symbolic links are also supported by this driver. If you have a CD-ROM drive and want to do more with it than just listen to audio CDs and watch its LEDs, say Y (and read - <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO, + <file:Documentation/filesystems/isofs.rst> and the CD-ROM-HOWTO, available from <http://www.tldp.org/docs.html#howto>), thereby enlarging your kernel by about 27 KB; otherwise say N. diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 62c0462dc89f..d634561f871a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -544,43 +544,41 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) { - struct cdrom_multisession ms_info; - unsigned int vol_desc_start; - struct block_device *bdev = sb->s_bdev; - int i; + struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); + unsigned int vol_desc_start = 0; - vol_desc_start=0; - ms_info.addr_format=CDROM_LBA; if (session > 0) { - struct cdrom_tocentry Te; - Te.cdte_track=session; - Te.cdte_format=CDROM_LBA; - i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te); - if (!i) { + struct cdrom_tocentry te; + + if (!cdi) + return 0; + + te.cdte_track = session; + te.cdte_format = CDROM_LBA; + if (cdrom_read_tocentry(cdi, &te) == 0) { printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n", - session, Te.cdte_addr.lba, - Te.cdte_ctrl&CDROM_DATA_TRACK); - if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4) - return Te.cdte_addr.lba; + session, te.cdte_addr.lba, + te.cdte_ctrl & CDROM_DATA_TRACK); + if ((te.cdte_ctrl & CDROM_DATA_TRACK) == 4) + return te.cdte_addr.lba; } printk(KERN_ERR "ISOFS: Invalid session number or type of track\n"); } - i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info); - if (session > 0) - printk(KERN_ERR "ISOFS: Invalid session number\n"); -#if 0 - printk(KERN_DEBUG "isofs.inode: CDROMMULTISESSION: rc=%d\n",i); - if (i==0) { - printk(KERN_DEBUG "isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no"); - printk(KERN_DEBUG "isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba); - } -#endif - if (i==0) + + if (cdi) { + struct cdrom_multisession ms_info; + + ms_info.addr_format = CDROM_LBA; + if (cdrom_multisession(cdi, &ms_info) == 0) { #if WE_OBEY_THE_WRITTEN_STANDARDS - if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ + /* necessary for a valid ms_info.addr */ + if (ms_info.xa_flag) #endif - vol_desc_start=ms_info.addr.lba; + vol_desc_start = ms_info.addr.lba; + } + } + return vol_desc_start; } @@ -1185,10 +1183,9 @@ static int isofs_readpage(struct file *file, struct page *page) return mpage_readpage(page, isofs_get_block); } -static int isofs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void isofs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, isofs_get_block); + mpage_readahead(rac, isofs_get_block); } static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) @@ -1198,7 +1195,7 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block) static const struct address_space_operations isofs_aops = { .readpage = isofs_readpage, - .readpages = isofs_readpages, + .readahead = isofs_readahead, .bmap = _isofs_bmap }; diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 96bf33986d03..263f02ad8ebf 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -414,7 +414,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * jbd2_cleanup_journal_tail() doesn't get called all that often. */ if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); return __jbd2_update_log_tail(journal, first_tid, blocknr); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index e855d8260433..6d2da8ad0e6f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -775,7 +775,7 @@ start_journal_io: if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS); /* Done it all: now write the commit record asynchronously. */ if (jbd2_has_feature_async_commit(journal)) { @@ -882,7 +882,7 @@ start_journal_io: stats.run.rs_blocks_logged++; if (jbd2_has_feature_async_commit(journal) && journal->j_flags & JBD2_BARRIER) { - blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); + blkdev_issue_flush(journal->j_dev, GFP_NOFS); } if (err) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index a4967b27ffb6..2ed278f0dced 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -286,7 +286,7 @@ int jbd2_journal_recover(journal_t *journal) err = err2; /* Make sure all replayed data is on permanent storage */ if (journal->j_flags & JBD2_BARRIER) { - err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL); if (!err) err = err2; } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 9486afcdac76..6f65bfa9f18d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -296,10 +296,9 @@ static int jfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, jfs_get_block); } -static int jfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void jfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); + mpage_readahead(rac, jfs_get_block); } static void jfs_write_failed(struct address_space *mapping, loff_t to) @@ -358,7 +357,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations jfs_aops = { .readpage = jfs_readpage, - .readpages = jfs_readpages, + .readahead = jfs_readahead, .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, diff --git a/fs/libfs.c b/fs/libfs.c index 3759fbacf522..4d08edf19c78 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1113,7 +1113,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end, err = __generic_file_fsync(file, start, end, datasync); if (err) return err; - return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); } EXPORT_SYMBOL(generic_file_fsync); diff --git a/fs/locks.c b/fs/locks.c index b8a31c1c4fff..1d4f4d5da704 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -61,7 +61,7 @@ * * Initial implementation of mandatory locks. SunOS turned out to be * a rotten model, so I implemented the "obvious" semantics. - * See 'Documentation/filesystems/mandatory-locking.txt' for details. + * See 'Documentation/filesystems/mandatory-locking.rst' for details. * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. * * Don't allow mandatory locks on mmap()'ed files. Added simple functions to diff --git a/fs/mount.h b/fs/mount.h index 711a4093e475..c7abb7b394d8 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -9,7 +9,13 @@ struct mnt_namespace { atomic_t count; struct ns_common ns; struct mount * root; + /* + * Traversal and modification of .list is protected by either + * - taking namespace_sem for write, OR + * - taking namespace_sem for read AND taking .ns_lock. + */ struct list_head list; + spinlock_t ns_lock; struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ @@ -133,9 +139,7 @@ struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); - void *cached_mount; - u64 cached_event; - loff_t cached_index; + struct mount cursor; }; extern const struct seq_operations mounts_op; @@ -153,3 +157,5 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) { return ns->seq == 0; } + +extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor); diff --git a/fs/mpage.c b/fs/mpage.c index ccba3c4c4479..830e6cc2a9e7 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -91,7 +91,7 @@ mpage_alloc(struct block_device *bdev, } /* - * support function for mpage_readpages. The fs supplied get_block might + * support function for mpage_readahead. The fs supplied get_block might * return an up to date buffer. This is used to map that buffer into * the page, which allows readpage to avoid triggering a duplicate call * to get_block. @@ -338,13 +338,8 @@ confused: } /** - * mpage_readpages - populate an address space with some pages & start reads against them - * @mapping: the address_space - * @pages: The address of a list_head which contains the target pages. These - * pages have their ->index populated and are otherwise uninitialised. - * The page at @pages->prev has the lowest file offset, and reads should be - * issued in @pages->prev to @pages->next order. - * @nr_pages: The number of pages at *@pages + * mpage_readahead - start reads against pages + * @rac: Describes which pages to read. * @get_block: The filesystem's block mapper function. * * This function walks the pages and the blocks within each page, building and @@ -381,36 +376,25 @@ confused: * * This all causes the disk requests to be issued in the correct order. */ -int -mpage_readpages(struct address_space *mapping, struct list_head *pages, - unsigned nr_pages, get_block_t get_block) +void mpage_readahead(struct readahead_control *rac, get_block_t get_block) { + struct page *page; struct mpage_readpage_args args = { .get_block = get_block, .is_readahead = true, }; - unsigned page_idx; - - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = lru_to_page(pages); + while ((page = readahead_page(rac))) { prefetchw(&page->flags); - list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, - page->index, - readahead_gfp_mask(mapping))) { - args.page = page; - args.nr_pages = nr_pages - page_idx; - args.bio = do_mpage_readpage(&args); - } + args.page = page; + args.nr_pages = readahead_count(rac); + args.bio = do_mpage_readpage(&args); put_page(page); } - BUG_ON(!list_empty(pages)); if (args.bio) mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); - return 0; } -EXPORT_SYMBOL(mpage_readpages); +EXPORT_SYMBOL(mpage_readahead); /* * This isn't called much at all @@ -563,7 +547,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * Page has buffers, but they are all unmapped. The page was * created by pagein or read over a hole which was handled by * block_read_full_page(). If this address_space is also - * using mpage_readpages then this can rarely happen. + * using mpage_readahead then this can rarely happen. */ goto confused; } diff --git a/fs/namei.c b/fs/namei.c index a320371899cf..d81f73ff1a8b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3505,12 +3505,14 @@ EXPORT_SYMBOL(user_path_create); int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { + bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(dir, dentry); if (error) return error; - if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && + !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) @@ -4345,9 +4347,6 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, (flags & RENAME_EXCHANGE)) return -EINVAL; - if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) - return -EPERM; - if (flags & RENAME_EXCHANGE) target_flags = 0; @@ -4483,20 +4482,6 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } -int vfs_whiteout(struct inode *dir, struct dentry *dentry) -{ - int error = may_create(dir, dentry); - if (error) - return error; - - if (!dir->i_op->mknod) - return -EPERM; - - return dir->i_op->mknod(dir, dentry, - S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); -} -EXPORT_SYMBOL(vfs_whiteout); - int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); diff --git a/fs/namespace.c b/fs/namespace.c index a28e4db075ed..a6baee3c7904 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -648,6 +648,21 @@ struct vfsmount *lookup_mnt(const struct path *path) return m; } +static inline void lock_ns_list(struct mnt_namespace *ns) +{ + spin_lock(&ns->ns_lock); +} + +static inline void unlock_ns_list(struct mnt_namespace *ns) +{ + spin_unlock(&ns->ns_lock); +} + +static inline bool mnt_is_cursor(struct mount *mnt) +{ + return mnt->mnt.mnt_flags & MNT_CURSOR; +} + /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. @@ -673,11 +688,15 @@ bool __is_local_mountpoint(struct dentry *dentry) goto out; down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { + if (mnt_is_cursor(mnt)) + continue; is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } + unlock_ns_list(ns); up_read(&namespace_sem); out: return is_covered; @@ -1245,46 +1264,71 @@ struct vfsmount *mnt_clone_internal(const struct path *path) } #ifdef CONFIG_PROC_FS +static struct mount *mnt_list_next(struct mnt_namespace *ns, + struct list_head *p) +{ + struct mount *mnt, *ret = NULL; + + lock_ns_list(ns); + list_for_each_continue(p, &ns->list) { + mnt = list_entry(p, typeof(*mnt), mnt_list); + if (!mnt_is_cursor(mnt)) { + ret = mnt; + break; + } + } + unlock_ns_list(ns); + + return ret; +} + /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; + struct list_head *prev; down_read(&namespace_sem); - if (p->cached_event == p->ns->event) { - void *v = p->cached_mount; - if (*pos == p->cached_index) - return v; - if (*pos == p->cached_index + 1) { - v = seq_list_next(v, &p->ns->list, &p->cached_index); - return p->cached_mount = v; - } + if (!*pos) { + prev = &p->ns->list; + } else { + prev = &p->cursor.mnt_list; + + /* Read after we'd reached the end? */ + if (list_empty(prev)) + return NULL; } - p->cached_event = p->ns->event; - p->cached_mount = seq_list_start(&p->ns->list, *pos); - p->cached_index = *pos; - return p->cached_mount; + return mnt_list_next(p->ns, prev); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = m->private; + struct mount *mnt = v; - p->cached_mount = seq_list_next(v, &p->ns->list, pos); - p->cached_index = *pos; - return p->cached_mount; + ++*pos; + return mnt_list_next(p->ns, &mnt->mnt_list); } static void m_stop(struct seq_file *m, void *v) { + struct proc_mounts *p = m->private; + struct mount *mnt = v; + + lock_ns_list(p->ns); + if (mnt) + list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); + else + list_del_init(&p->cursor.mnt_list); + unlock_ns_list(p->ns); up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; - struct mount *r = list_entry(v, struct mount, mnt_list); + struct mount *r = v; return p->show(m, &r->mnt); } @@ -1294,6 +1338,15 @@ const struct seq_operations mounts_op = { .stop = m_stop, .show = m_show, }; + +void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) +{ + down_read(&namespace_sem); + lock_ns_list(ns); + list_del(&cursor->mnt_list); + unlock_ns_list(ns); + up_read(&namespace_sem); +} #endif /* CONFIG_PROC_FS */ /** @@ -3202,6 +3255,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a atomic_set(&new_ns->count, 1); INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); + spin_lock_init(&new_ns->ns_lock); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; @@ -3595,7 +3649,7 @@ EXPORT_SYMBOL(path_is_under); * file system may be mounted on put_old. After all, new_root is a mountpoint. * * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. - * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. * * Notes: @@ -3842,10 +3896,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns, bool visible = false; down_read(&namespace_sem); + lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; int mnt_flags; + if (mnt_is_cursor(mnt)) + continue; + if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; @@ -3893,6 +3951,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, next: ; } found: + unlock_ns_list(ns); up_read(&namespace_sem); return visible; } diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 7a57ff2528af..8f7cff7a4293 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -582,7 +582,7 @@ retry: if (!arg->layoutupdate_pages) return -ENOMEM; - start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + start_p = __vmalloc(buffer_size, GFP_NOFS); if (!start_p) { kfree(arg->layoutupdate_pages); return -ENOMEM; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1f32a9fbfdaf..6673a77884d9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -668,7 +668,8 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) } /* - * Record the page as unstable and mark its inode as dirty. + * Record the page as unstable (an extra writeback period) and mark its + * inode as dirty. */ static inline void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) @@ -676,8 +677,11 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) if (!cinfo->dreq) { struct inode *inode = page_file_mapping(page)->host; - inc_node_page_state(page, NR_UNSTABLE_NFS); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); + /* This page is really still in write-back - just that the + * writeback is happening on the server now. + */ + inc_node_page_state(page, NR_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1e767f779c49..639c34fec04a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -946,9 +946,9 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static void nfs_clear_page_commit(struct page *page) { - dec_node_page_state(page, NR_UNSTABLE_NFS); + dec_node_page_state(page, NR_WRITEBACK); dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, - WB_RECLAIMABLE); + WB_WRITEBACK); } /* Called holding the request lock on @req */ diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index a8fb18609146..9e40dfecf1b1 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -127,16 +127,8 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) goto out; } - { - SHASH_DESC_ON_STACK(desc, tfm); - - desc->tfm = tfm; - - status = crypto_shash_digest(desc, clname->data, clname->len, - cksum.data); - shash_desc_zero(desc); - } - + status = crypto_shash_tfm_digest(tfm, clname->data, clname->len, + cksum.data); if (status) goto out; @@ -1148,7 +1140,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; - SHASH_DESC_ON_STACK(desc, tfm); /* Don't upcall if it's already stored */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) @@ -1170,16 +1161,14 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal) { - desc->tfm = tfm; cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) { ret = -ENOMEM; goto out; } - ret = crypto_shash_digest(desc, principal, strlen(principal), - cksum.data); - shash_desc_zero(desc); + ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal), + cksum.data); if (ret) { kfree(cksum.data); goto out; @@ -1343,7 +1332,6 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; - SHASH_DESC_ON_STACK(desc, tfm); /* did we already find that this client is stable? */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) @@ -1381,14 +1369,12 @@ found: principal = clp->cl_cred.cr_principal; if (principal == NULL) return -ENOENT; - desc->tfm = tfm; cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) return -ENOENT; - status = crypto_shash_digest(desc, principal, strlen(principal), - cksum.data); - shash_desc_zero(desc); + status = crypto_shash_tfm_digest(tfm, principal, + strlen(principal), cksum.data); if (status) { kfree(cksum.data); return -ENOENT; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 0aa02eb18bd3..c3fbab1753ec 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -979,12 +979,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* - * We want less throttling in balance_dirty_pages() - * and shrink_inactive_list() so that nfs to + * We want throttling in balance_dirty_pages() + * and shrink_inactive_list() to only consider + * the backingdev we are writing to, so that nfs to * localhost doesn't cause nfsd to lock up due to all * the client's dirty pages or its congested queue. */ - current->flags |= PF_LESS_THROTTLE; + current->flags |= PF_LOCAL_THROTTLE; exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); @@ -1037,7 +1038,7 @@ out_nfserr: nfserr = nfserrno(host_err); } if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - current_restore_flags(pflags, PF_LESS_THROTTLE); + current_restore_flags(pflags, PF_LOCAL_THROTTLE); return nfserr; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 671085512e0f..ceeb3b441844 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -145,18 +145,9 @@ static int nilfs_readpage(struct file *file, struct page *page) return mpage_readpage(page, nilfs_get_block); } -/** - * nilfs_readpages() - implement readpages() method of nilfs_aops {} - * address_space_operations. - * @file - file struct of the file to be read - * @mapping - address_space struct used for reading multiple pages - * @pages - the pages to be read - * @nr_pages - number of pages to be read - */ -static int nilfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void nilfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); + mpage_readahead(rac, nilfs_get_block); } static int nilfs_writepages(struct address_space *mapping, @@ -308,7 +299,7 @@ const struct address_space_operations nilfs_aops = { .readpage = nilfs_readpage, .writepages = nilfs_writepages, .set_page_dirty = nilfs_set_page_dirty, - .readpages = nilfs_readpages, + .readahead = nilfs_readahead, .write_begin = nilfs_write_begin, .write_end = nilfs_write_end, /* .releasepage = nilfs_releasepage, */ diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 380a543c5b19..b55cdeb4d169 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -375,7 +375,7 @@ static inline int nilfs_flush_device(struct the_nilfs *nilfs) */ smp_wmb(); - err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL, NULL); + err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL); if (err != -EIO) err = 0; return err; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 5435a40f82be..c18459cea6f4 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -520,7 +520,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); mask = fanotify_group_event_mask(group, iter_info, mask, data, data_type); diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 6736e47d94d8..7715fadd5fff 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -12,6 +12,6 @@ config INOTIFY_USER new features including multiple file events, one-shot support, and unmount notification. - For more information, see <file:Documentation/filesystems/inotify.txt> + For more information, see <file:Documentation/filesystems/inotify.rst> If unsure, say Y. diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index de9fb5cff226..1667a7e590d8 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -18,7 +18,7 @@ config NTFS_FS the Linux 2.4 kernel series is separately available as a patch from the project web site. - For more information see <file:Documentation/filesystems/ntfs.txt> + For more information see <file:Documentation/filesystems/ntfs.rst> and <http://www.linux-ntfs.org/>. To compile this file system support as a module, choose M here: the diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 554b744f41bf..bb0a43860ad2 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1732,7 +1732,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } else buffers_to_free = bh; } diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index 842b0bfc3ac9..7068425735f1 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -34,7 +34,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) /* return (void *)__get_free_page(gfp_mask); */ } if (likely((size >> PAGE_SHIFT) < totalram_pages())) - return __vmalloc(size, gfp_mask, PAGE_KERNEL); + return __vmalloc(size, gfp_mask); return NULL; } diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 3aac5c917afe..fbb9f1bc623d 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -504,7 +504,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - attach_page_buffers(page, head); + attach_page_private(page, head); } bh = head = page_buffers(page); BUG_ON(!bh); diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 46bba20da6b5..1177c33df895 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -21,7 +21,7 @@ config OCFS2_FS OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ For more information on OCFS2, see the file - <file:Documentation/filesystems/ocfs2.txt>. + <file:Documentation/filesystems/ocfs2.rst>. config OCFS2_FS_O2CB tristate "O2CB Kernelspace Clustering" diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3a67a6518ddf..3bfb4147895a 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -350,14 +350,11 @@ out: * grow out to a tree. If need be, detecting boundary extents could * trivially be added in a future version of ocfs2_get_block(). */ -static int ocfs2_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void ocfs2_readahead(struct readahead_control *rac) { - int ret, err = -EIO; - struct inode *inode = mapping->host; + int ret; + struct inode *inode = rac->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start; - struct page *last; /* * Use the nonblocking flag for the dlm code to avoid page @@ -365,36 +362,31 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping, */ ret = ocfs2_inode_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); if (ret) - return err; + return; - if (down_read_trylock(&oi->ip_alloc_sem) == 0) { - ocfs2_inode_unlock(inode, 0); - return err; - } + if (down_read_trylock(&oi->ip_alloc_sem) == 0) + goto out_unlock; /* * Don't bother with inline-data. There isn't anything * to read-ahead in that case anyway... */ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - goto out_unlock; + goto out_up; /* * Check whether a remote node truncated this file - we just * drop out in that case as it's not worth handling here. */ - last = lru_to_page(pages); - start = (loff_t)last->index << PAGE_SHIFT; - if (start >= i_size_read(inode)) - goto out_unlock; + if (readahead_pos(rac) >= i_size_read(inode)) + goto out_up; - err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + mpage_readahead(rac, ocfs2_get_block); -out_unlock: +out_up: up_read(&oi->ip_alloc_sem); +out_unlock: ocfs2_inode_unlock(inode, 0); - - return err; } /* Note: Because we don't support holes, our allocation has @@ -2474,7 +2466,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, - .readpages = ocfs2_readpages, + .readahead = ocfs2_readahead, .writepage = ocfs2_writepage, .write_begin = ocfs2_write_begin, .write_end = ocfs2_write_end, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 55a6512e9fde..f105746063ed 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2760,6 +2760,7 @@ leave: * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) + __must_hold(&dlm->spinlock) { int ret; int lock_dropped = 0; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 1de77f1a600b..ea868c6f9800 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -227,7 +227,7 @@ static ssize_t dlmfs_file_read(struct file *filp, loff_t *ppos) { int bytes_left; - ssize_t readlen, got; + ssize_t got; char *lvb_buf; struct inode *inode = file_inode(filp); @@ -237,36 +237,31 @@ static ssize_t dlmfs_file_read(struct file *filp, if (*ppos >= i_size_read(inode)) return 0; + /* don't read past the lvb */ + if (count > i_size_read(inode) - *ppos) + count = i_size_read(inode) - *ppos; + if (!count) return 0; - if (!access_ok(buf, count)) - return -EFAULT; - - /* don't read past the lvb */ - if ((count + *ppos) > i_size_read(inode)) - readlen = i_size_read(inode) - *ppos; - else - readlen = count; - - lvb_buf = kmalloc(readlen, GFP_NOFS); + lvb_buf = kmalloc(count, GFP_NOFS); if (!lvb_buf) return -ENOMEM; - got = user_dlm_read_lvb(inode, lvb_buf, readlen); + got = user_dlm_read_lvb(inode, lvb_buf, count); if (got) { - BUG_ON(got != readlen); - bytes_left = __copy_to_user(buf, lvb_buf, readlen); - readlen -= bytes_left; + BUG_ON(got != count); + bytes_left = copy_to_user(buf, lvb_buf, count); + count -= bytes_left; } else - readlen = 0; + count = 0; kfree(lvb_buf); - *ppos = *ppos + readlen; + *ppos = *ppos + count; - mlog(0, "read %zd bytes\n", readlen); - return readlen; + mlog(0, "read %zu bytes\n", count); + return count; } static ssize_t dlmfs_file_write(struct file *filp, @@ -291,9 +286,6 @@ static ssize_t dlmfs_file_write(struct file *filp, if (!count) return 0; - if (!access_ok(buf, count)) - return -EFAULT; - lvb_buf = kmalloc(count, GFP_NOFS); if (!lvb_buf) return -ENOMEM; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6cd5e4924e4d..85979e2214b3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -194,7 +194,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, needs_barrier = true; err = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (!err) err = ret; } diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 9150cfa4df7d..ee5d98516212 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -279,6 +279,7 @@ enum ocfs2_mount_options OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ + OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -673,7 +674,8 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb) static inline int ocfs2_mount_local(struct ocfs2_super *osb) { - return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); + return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT) + || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER)); } static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 8caeceeaeda7..4da0e4b1e79b 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -254,14 +254,16 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, int i, ret = -ENOSPC; if ((preferred >= 0) && (preferred < si->si_num_slots)) { - if (!si->si_slots[preferred].sl_valid) { + if (!si->si_slots[preferred].sl_valid || + !si->si_slots[preferred].sl_node_num) { ret = preferred; goto out; } } for(i = 0; i < si->si_num_slots; i++) { - if (!si->si_slots[i].sl_valid) { + if (!si->si_slots[i].sl_valid || + !si->si_slots[i].sl_node_num) { ret = i; break; } @@ -456,24 +458,30 @@ int ocfs2_find_slot(struct ocfs2_super *osb) spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); - /* search for ourselves first and take the slot if it already - * exists. Perhaps we need to mark this in a variable for our - * own journal recovery? Possibly not, though we certainly - * need to warn to the user */ - slot = __ocfs2_node_num_to_slot(si, osb->node_num); - if (slot < 0) { - /* if no slot yet, then just take 1st available - * one. */ - slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (ocfs2_mount_local(osb)) + /* use slot 0 directly in local mode */ + slot = 0; + else { + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); if (slot < 0) { - spin_unlock(&osb->osb_lock); - mlog(ML_ERROR, "no free slots available!\n"); - status = -EINVAL; - goto bail; - } - } else - printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already " - "allocated to this node!\n", slot, osb->dev_str); + /* if no slot yet, then just take 1st available + * one. */ + slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); + if (slot < 0) { + spin_unlock(&osb->osb_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + printk(KERN_INFO "ocfs2: Slot %d on device (%s) was " + "already allocated to this node!\n", + slot, osb->dev_str); + } ocfs2_set_slot(si, slot, osb->node_num); osb->slot_num = slot; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ac61eeaf3837..71ea9ce71a6b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -175,6 +175,7 @@ enum { Opt_dir_resv_level, Opt_journal_async_commit, Opt_err_cont, + Opt_nocluster, Opt_err, }; @@ -208,6 +209,7 @@ static const match_table_t tokens = { {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err_cont, "errors=continue"}, + {Opt_nocluster, "nocluster"}, {Opt_err, NULL} }; @@ -619,6 +621,13 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } + tmp = OCFS2_MOUNT_NOCLUSTER; + if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change nocluster option on remount\n"); + goto out; + } + tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | OCFS2_MOUNT_HB_NONE; if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) { @@ -859,6 +868,7 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, } if (ocfs2_userspace_stack(osb) && + !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && strncmp(osb->osb_cluster_stack, mopt->cluster_stack, OCFS2_STACK_LABEL_LEN)) { mlog(ML_ERROR, @@ -1139,6 +1149,11 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : "ordered"); + if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) && + !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)) + printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted " + "without cluster aware mode.\n", osb->dev_str); + atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); @@ -1445,6 +1460,9 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_journal_async_commit: mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; break; + case Opt_nocluster: + mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1556,6 +1574,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) seq_printf(s, ",journal_async_commit"); + if (opts & OCFS2_MOUNT_NOCLUSTER) + seq_printf(s, ",nocluster"); + return 0; } diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d640b9388238..d7b5f09d298c 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -289,10 +289,9 @@ static int omfs_readpage(struct file *file, struct page *page) return block_read_full_page(page, omfs_get_block); } -static int omfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void omfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, omfs_get_block); + mpage_readahead(rac, omfs_get_block); } static int omfs_writepage(struct page *page, struct writeback_control *wbc) @@ -373,7 +372,7 @@ const struct inode_operations omfs_file_inops = { const struct address_space_operations omfs_aops = { .readpage = omfs_readpage, - .readpages = omfs_readpages, + .readahead = omfs_readahead, .writepage = omfs_writepage, .writepages = omfs_writepages, .write_begin = omfs_write_begin, diff --git a/fs/open.c b/fs/open.c index 719b320ede52..6cd48a61cda3 100644 --- a/fs/open.c +++ b/fs/open.c @@ -345,21 +345,14 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. */ -long do_faccessat(int dfd, const char __user *filename, int mode) +static const struct cred *access_override_creds(void) { const struct cred *old_cred; struct cred *override_cred; - struct path path; - struct inode *inode; - int res; - unsigned int lookup_flags = LOOKUP_FOLLOW; - - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ - return -EINVAL; override_cred = prepare_creds(); if (!override_cred) - return -ENOMEM; + return NULL; override_cred->fsuid = override_cred->uid; override_cred->fsgid = override_cred->gid; @@ -394,6 +387,38 @@ long do_faccessat(int dfd, const char __user *filename, int mode) override_cred->non_rcu = 1; old_cred = override_creds(override_cred); + + /* override_cred() gets its own ref */ + put_cred(override_cred); + + return old_cred; +} + +long do_faccessat(int dfd, const char __user *filename, int mode, int flags) +{ + struct path path; + struct inode *inode; + int res; + unsigned int lookup_flags = LOOKUP_FOLLOW; + const struct cred *old_cred = NULL; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; + + if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) + return -EINVAL; + + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + if (!(flags & AT_EACCESS)) { + old_cred = access_override_creds(); + if (!old_cred) + return -ENOMEM; + } + retry: res = user_path_at(dfd, filename, lookup_flags, &path); if (res) @@ -435,19 +460,26 @@ out_path_release: goto retry; } out: - revert_creds(old_cred); - put_cred(override_cred); + if (old_cred) + revert_creds(old_cred); + return res; } SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) { - return do_faccessat(dfd, filename, mode); + return do_faccessat(dfd, filename, mode, 0); +} + +SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode, + int, flags) +{ + return do_faccessat(dfd, filename, mode, flags); } SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { - return do_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode, 0); } int ksys_chdir(const char __user *filename) @@ -743,9 +775,8 @@ static int do_dentry_open(struct file *f, path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; - - /* Ensure that we skip any errors that predate opening of the file */ f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + f->f_sb_err = file_sample_sb_err(f); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH | FMODE_OPENED; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 12ae630fbed7..48f0547d4850 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -62,12 +62,7 @@ static int orangefs_writepage_locked(struct page *page, } else { ret = 0; } - if (wr) { - kfree(wr); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } + kfree(detach_page_private(page)); return ret; } @@ -409,9 +404,7 @@ static int orangefs_write_begin(struct file *file, wr->len = len; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - SetPagePrivate(page); - set_page_private(page, (unsigned long)wr); - get_page(page); + attach_page_private(page, wr); okay: return 0; } @@ -459,18 +452,12 @@ static void orangefs_invalidatepage(struct page *page, wr = (struct orangefs_write_range *)page_private(page); if (offset == 0 && length == PAGE_SIZE) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); + kfree(detach_page_private(page)); return; /* write range entirely within invalidate range (or equal) */ } else if (page_offset(page) + offset <= wr->pos && wr->pos + wr->len <= page_offset(page) + offset + length) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); + kfree(detach_page_private(page)); /* XXX is this right? only caller in fs */ cancel_dirty_page(page); return; @@ -535,12 +522,7 @@ static int orangefs_releasepage(struct page *page, gfp_t foo) static void orangefs_freepage(struct page *page) { - if (PagePrivate(page)) { - kfree((struct orangefs_write_range *)page_private(page)); - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } + kfree(detach_page_private(page)); } static int orangefs_launder_page(struct page *page) @@ -740,9 +722,7 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) wr->len = PAGE_SIZE; wr->uid = current_fsuid(); wr->gid = current_fsgid(); - SetPagePrivate(page); - set_page_private(page, (unsigned long)wr); - get_page(page); + attach_page_private(page, wr); okay: file_update_time(vmf->vma->vm_file); diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 714c14c47ca5..dd188c7996b3 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -9,7 +9,7 @@ config OVERLAY_FS 'lower' filesystem is either hidden or, in the case of directories, merged with the 'upper' object. - For more information see Documentation/filesystems/overlayfs.txt + For more information see Documentation/filesystems/overlayfs.rst config OVERLAY_FS_REDIRECT_DIR bool "Overlayfs: turn on redirect directory feature by default" @@ -38,7 +38,7 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW If backward compatibility is not an issue, then it is safe and recommended to say N here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say Y. @@ -103,7 +103,7 @@ config OVERLAY_FS_XINO_AUTO If compatibility with applications that expect 32bit inodes is not an issue, then it is safe and recommended to say Y here. - For more information, see Documentation/filesystems/overlayfs.txt + For more information, see Documentation/filesystems/overlayfs.rst If unsure, say N. diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 27ef84d99f59..971a42f6357d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -23,7 +23,7 @@ config PROC_FS /proc" or the equivalent line in /etc/fstab does the job. The /proc file system is explained in the file - <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage + <file:Documentation/filesystems/proc.rst> and on the proc(5) manpage ("man 5 proc"). This option will enlarge your kernel by about 67 KB. Several @@ -95,7 +95,7 @@ config PROC_CHILDREN default n help Provides a fast way to retrieve first level children pids of a task. See - <file:Documentation/filesystems/proc.txt> for more information. + <file:Documentation/filesystems/proc.rst> for more information. Say Y if you are running any user-space software which takes benefit from this interface. For example, rkt is such a piece of software. diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8c1f1bb1a5ce..ecc63ce01be7 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -103,11 +103,14 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_zone_page_state(NR_KERNEL_STACK_KB)); +#ifdef CONFIG_SHADOW_CALL_STACK + seq_printf(m, "ShadowCallStack:%8lu kB\n", + global_zone_page_state(NR_KERNEL_SCS_KB)); +#endif show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); - show_val_kb(m, "NFS_Unstable: ", - global_node_page_state(NR_UNSTABLE_NFS)); + show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8d382d4ec067..6ad407d5efe2 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -546,10 +546,17 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); - struct page *page; + struct page *page = NULL; - /* FOLL_DUMP will return -EFAULT on huge zero page */ - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + if (pmd_present(*pmd)) { + /* FOLL_DUMP will return -EFAULT on huge zero page */ + page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { + swp_entry_t entry = pmd_to_swp_entry(*pmd); + + if (is_migration_entry(entry)) + page = migration_entry_to_page(entry); + } if (IS_ERR_OR_NULL(page)) return; if (PageAnon(page)) @@ -578,8 +585,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - if (pmd_present(*pmd)) - smaps_pmd_entry(pmd, addr, walk); + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); goto out; } @@ -622,9 +628,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", [ilog2(VM_DENYWRITE)] = "dw", -#ifdef CONFIG_X86_INTEL_MPX - [ilog2(VM_MPX)] = "mp", -#endif [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", @@ -638,6 +641,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", +#ifdef CONFIG_ARM64_BTI + [ilog2(VM_ARM64_BTI)] = "bt", +#endif #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", #endif diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 273ee82d8aa9..e4d70c0dffe9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -279,7 +279,8 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->ns = ns; p->root = root; p->show = show; - p->cached_event = ~0ULL; + INIT_LIST_HEAD(&p->cursor.mnt_list); + p->cursor.mnt.mnt_flags = MNT_CURSOR; return 0; @@ -296,6 +297,7 @@ static int mounts_release(struct inode *inode, struct file *file) struct seq_file *m = file->private_data; struct proc_mounts *p = m->private; path_put(&p->root); + mnt_cursor_del(p->ns, &p->cursor); put_mnt_ns(p->ns); return seq_release_private(inode, file); } diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 8f0369aad22a..e16a49ebfe54 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -153,3 +153,112 @@ config PSTORE_RAM "ramoops.ko". For more information, see Documentation/admin-guide/ramoops.rst. + +config PSTORE_ZONE + tristate + depends on PSTORE + help + The common layer for pstore/blk (and pstore/ram in the future) + to manage storage in zones. + +config PSTORE_BLK + tristate "Log panic/oops to a block device" + depends on PSTORE + depends on BLOCK + select PSTORE_ZONE + default n + help + This enables panic and oops message to be logged to a block dev + where it can be read back at some later point. + + For more information, see Documentation/admin-guide/pstore-blk.rst + + If unsure, say N. + +config PSTORE_BLK_BLKDEV + string "block device identifier" + depends on PSTORE_BLK + default "" + help + Which block device should be used for pstore/blk. + + It accepts the following variants: + 1) <hex_major><hex_minor> device number in hexadecimal representation, + with no leading 0x, for example b302. + 2) /dev/<disk_name> represents the device name of disk + 3) /dev/<disk_name><decimal> represents the device name and number + of partition - device number of disk plus the partition number + 4) /dev/<disk_name>p<decimal> - same as the above, this form is + used when disk name of partitioned disk ends with a digit. + 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + unique id of a partition if the partition table provides it. + The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + filled hex representation of the 32-bit "NT disk signature", and PP + is a zero-filled hex representation of the 1-based partition number. + 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation + to a partition with a known unique id. + 7) <major>:<minor> major and minor number of the device separated by + a colon. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_KMSG_SIZE + int "Size in Kbytes of kmsg dump log to store" + depends on PSTORE_BLK + default 64 + help + This just sets size of kmsg dump (oops, panic, etc) log for + pstore/blk. The size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_MAX_REASON + int "Maximum kmsg dump reason to store" + depends on PSTORE_BLK + default 2 + help + The maximum reason for kmsg dumps to store. The default is + 2 (KMSG_DUMP_OOPS), see include/linux/kmsg_dump.h's + enum kmsg_dump_reason for more details. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_PMSG_SIZE + int "Size in Kbytes of pmsg to store" + depends on PSTORE_BLK + depends on PSTORE_PMSG + default 64 + help + This just sets size of pmsg (pmsg_size) for pstore/blk. The size is + in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_CONSOLE_SIZE + int "Size in Kbytes of console log to store" + depends on PSTORE_BLK + depends on PSTORE_CONSOLE + default 64 + help + This just sets size of console log (console_size) to store via + pstore/blk. The size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_FTRACE_SIZE + int "Size in Kbytes of ftrace log to store" + depends on PSTORE_BLK + depends on PSTORE_FTRACE + default 64 + help + This just sets size of ftrace log (ftrace_size) for pstore/blk. The + size is in KB and must be a multiple of 4. + + NOTE that, both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 967b5891f325..c270467aeece 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -12,3 +12,9 @@ pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o ramoops-objs += ram.o ram_core.o obj-$(CONFIG_PSTORE_RAM) += ramoops.o + +pstore_zone-objs += zone.o +obj-$(CONFIG_PSTORE_ZONE) += pstore_zone.o + +pstore_blk-objs += blk.o +obj-$(CONFIG_PSTORE_BLK) += pstore_blk.o diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c new file mode 100644 index 000000000000..fcd5563dde06 --- /dev/null +++ b/fs/pstore/blk.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implements pstore backend driver that write to block (or non-block) storage + * devices, using the pstore/zone API. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include "../../block/blk.h" +#include <linux/blkdev.h> +#include <linux/string.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/platform_device.h> +#include <linux/pstore_blk.h> +#include <linux/mount.h> +#include <linux/uio.h> + +static long kmsg_size = CONFIG_PSTORE_BLK_KMSG_SIZE; +module_param(kmsg_size, long, 0400); +MODULE_PARM_DESC(kmsg_size, "kmsg dump record size in kbytes"); + +static int max_reason = CONFIG_PSTORE_BLK_MAX_REASON; +module_param(max_reason, int, 0400); +MODULE_PARM_DESC(max_reason, + "maximum reason for kmsg dump (default 2: Oops and Panic)"); + +#if IS_ENABLED(CONFIG_PSTORE_PMSG) +static long pmsg_size = CONFIG_PSTORE_BLK_PMSG_SIZE; +#else +static long pmsg_size = -1; +#endif +module_param(pmsg_size, long, 0400); +MODULE_PARM_DESC(pmsg_size, "pmsg size in kbytes"); + +#if IS_ENABLED(CONFIG_PSTORE_CONSOLE) +static long console_size = CONFIG_PSTORE_BLK_CONSOLE_SIZE; +#else +static long console_size = -1; +#endif +module_param(console_size, long, 0400); +MODULE_PARM_DESC(console_size, "console size in kbytes"); + +#if IS_ENABLED(CONFIG_PSTORE_FTRACE) +static long ftrace_size = CONFIG_PSTORE_BLK_FTRACE_SIZE; +#else +static long ftrace_size = -1; +#endif +module_param(ftrace_size, long, 0400); +MODULE_PARM_DESC(ftrace_size, "ftrace size in kbytes"); + +static bool best_effort; +module_param(best_effort, bool, 0400); +MODULE_PARM_DESC(best_effort, "use best effort to write (i.e. do not require storage driver pstore support, default: off)"); + +/* + * blkdev - the block device to use for pstore storage + * + * Usually, this will be a partition of a block device. + * + * blkdev accepts the following variants: + * 1) <hex_major><hex_minor> device number in hexadecimal representation, + * with no leading 0x, for example b302. + * 2) /dev/<disk_name> represents the device number of disk + * 3) /dev/<disk_name><decimal> represents the device number + * of partition - device number of disk plus the partition number + * 4) /dev/<disk_name>p<decimal> - same as the above, that form is + * used when disk name of partitioned disk ends on a digit. + * 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. + * 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to + * a partition with a known unique id. + * 7) <major>:<minor> major and minor number of the device separated by + * a colon. + */ +static char blkdev[80] = CONFIG_PSTORE_BLK_BLKDEV; +module_param_string(blkdev, blkdev, 80, 0400); +MODULE_PARM_DESC(blkdev, "block device for pstore storage"); + +/* + * All globals must only be accessed under the pstore_blk_lock + * during the register/unregister functions. + */ +static DEFINE_MUTEX(pstore_blk_lock); +static struct block_device *psblk_bdev; +static struct pstore_zone_info *pstore_zone_info; +static pstore_blk_panic_write_op blkdev_panic_write; + +struct bdev_info { + dev_t devt; + sector_t nr_sects; + sector_t start_sect; +}; + +#define check_size(name, alignsize) ({ \ + long _##name_ = (name); \ + _##name_ = _##name_ <= 0 ? 0 : (_##name_ * 1024); \ + if (_##name_ & ((alignsize) - 1)) { \ + pr_info(#name " must align to %d\n", \ + (alignsize)); \ + _##name_ = ALIGN(name, (alignsize)); \ + } \ + _##name_; \ +}) + +static int __register_pstore_device(struct pstore_device_info *dev) +{ + int ret; + + lockdep_assert_held(&pstore_blk_lock); + + if (!dev || !dev->total_size || !dev->read || !dev->write) + return -EINVAL; + + /* someone already registered before */ + if (pstore_zone_info) + return -EBUSY; + + pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL); + if (!pstore_zone_info) + return -ENOMEM; + + /* zero means not limit on which backends to attempt to store. */ + if (!dev->flags) + dev->flags = UINT_MAX; + +#define verify_size(name, alignsize, enabled) { \ + long _##name_; \ + if (enabled) \ + _##name_ = check_size(name, alignsize); \ + else \ + _##name_ = 0; \ + name = _##name_ / 1024; \ + pstore_zone_info->name = _##name_; \ + } + + verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); + verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); + verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE); + verify_size(ftrace_size, 4096, dev->flags & PSTORE_FLAGS_FTRACE); +#undef verify_size + + pstore_zone_info->total_size = dev->total_size; + pstore_zone_info->max_reason = max_reason; + pstore_zone_info->read = dev->read; + pstore_zone_info->write = dev->write; + pstore_zone_info->erase = dev->erase; + pstore_zone_info->panic_write = dev->panic_write; + pstore_zone_info->name = KBUILD_MODNAME; + pstore_zone_info->owner = THIS_MODULE; + + ret = register_pstore_zone(pstore_zone_info); + if (ret) { + kfree(pstore_zone_info); + pstore_zone_info = NULL; + } + return ret; +} +/** + * register_pstore_device() - register non-block device to pstore/blk + * + * @dev: non-block device information + * + * Return: + * * 0 - OK + * * Others - something error. + */ +int register_pstore_device(struct pstore_device_info *dev) +{ + int ret; + + mutex_lock(&pstore_blk_lock); + ret = __register_pstore_device(dev); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_pstore_device); + +static void __unregister_pstore_device(struct pstore_device_info *dev) +{ + lockdep_assert_held(&pstore_blk_lock); + if (pstore_zone_info && pstore_zone_info->read == dev->read) { + unregister_pstore_zone(pstore_zone_info); + kfree(pstore_zone_info); + pstore_zone_info = NULL; + } +} + +/** + * unregister_pstore_device() - unregister non-block device from pstore/blk + * + * @dev: non-block device information + */ +void unregister_pstore_device(struct pstore_device_info *dev) +{ + mutex_lock(&pstore_blk_lock); + __unregister_pstore_device(dev); + mutex_unlock(&pstore_blk_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_device); + +/** + * psblk_get_bdev() - open block device + * + * @holder: Exclusive holder identifier + * @info: Information about bdev to fill in + * + * Return: pointer to block device on success and others on error. + * + * On success, the returned block_device has reference count of one. + */ +static struct block_device *psblk_get_bdev(void *holder, + struct bdev_info *info) +{ + struct block_device *bdev = ERR_PTR(-ENODEV); + fmode_t mode = FMODE_READ | FMODE_WRITE; + sector_t nr_sects; + + lockdep_assert_held(&pstore_blk_lock); + + if (pstore_zone_info) + return ERR_PTR(-EBUSY); + + if (!blkdev[0]) + return ERR_PTR(-ENODEV); + + if (holder) + mode |= FMODE_EXCL; + bdev = blkdev_get_by_path(blkdev, mode, holder); + if (IS_ERR(bdev)) { + dev_t devt; + + devt = name_to_dev_t(blkdev); + if (devt == 0) + return ERR_PTR(-ENODEV); + bdev = blkdev_get_by_dev(devt, mode, holder); + if (IS_ERR(bdev)) + return bdev; + } + + nr_sects = part_nr_sects_read(bdev->bd_part); + if (!nr_sects) { + pr_err("not enough space for '%s'\n", blkdev); + blkdev_put(bdev, mode); + return ERR_PTR(-ENOSPC); + } + + if (info) { + info->devt = bdev->bd_dev; + info->nr_sects = nr_sects; + info->start_sect = get_start_sect(bdev); + } + + return bdev; +} + +static void psblk_put_bdev(struct block_device *bdev, void *holder) +{ + fmode_t mode = FMODE_READ | FMODE_WRITE; + + lockdep_assert_held(&pstore_blk_lock); + + if (!bdev) + return; + + if (holder) + mode |= FMODE_EXCL; + blkdev_put(bdev, mode); +} + +static ssize_t psblk_generic_blk_read(char *buf, size_t bytes, loff_t pos) +{ + struct block_device *bdev = psblk_bdev; + struct file file; + struct kiocb kiocb; + struct iov_iter iter; + struct kvec iov = {.iov_base = buf, .iov_len = bytes}; + + if (!bdev) + return -ENODEV; + + memset(&file, 0, sizeof(struct file)); + file.f_mapping = bdev->bd_inode->i_mapping; + file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; + file.f_inode = bdev->bd_inode; + file_ra_state_init(&file.f_ra, file.f_mapping); + + init_sync_kiocb(&kiocb, &file); + kiocb.ki_pos = pos; + iov_iter_kvec(&iter, READ, &iov, 1, bytes); + + return generic_file_read_iter(&kiocb, &iter); +} + +static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, + loff_t pos) +{ + struct block_device *bdev = psblk_bdev; + struct iov_iter iter; + struct kiocb kiocb; + struct file file; + ssize_t ret; + struct kvec iov = {.iov_base = (void *)buf, .iov_len = bytes}; + + if (!bdev) + return -ENODEV; + + /* Console/Ftrace backend may handle buffer until flush dirty zones */ + if (in_interrupt() || irqs_disabled()) + return -EBUSY; + + memset(&file, 0, sizeof(struct file)); + file.f_mapping = bdev->bd_inode->i_mapping; + file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; + file.f_inode = bdev->bd_inode; + + init_sync_kiocb(&kiocb, &file); + kiocb.ki_pos = pos; + iov_iter_kvec(&iter, WRITE, &iov, 1, bytes); + + inode_lock(bdev->bd_inode); + ret = generic_write_checks(&kiocb, &iter); + if (ret > 0) + ret = generic_perform_write(&file, &iter, pos); + inode_unlock(bdev->bd_inode); + + if (likely(ret > 0)) { + const struct file_operations f_op = {.fsync = blkdev_fsync}; + + file.f_op = &f_op; + kiocb.ki_pos += ret; + ret = generic_write_sync(&kiocb, ret); + } + return ret; +} + +static ssize_t psblk_blk_panic_write(const char *buf, size_t size, + loff_t off) +{ + int ret; + + if (!blkdev_panic_write) + return -EOPNOTSUPP; + + /* size and off must align to SECTOR_SIZE for block device */ + ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT, + size >> SECTOR_SHIFT); + /* try next zone */ + if (ret == -ENOMSG) + return ret; + return ret ? -EIO : size; +} + +static int __register_pstore_blk(struct pstore_blk_info *info) +{ + char bdev_name[BDEVNAME_SIZE]; + struct block_device *bdev; + struct pstore_device_info dev; + struct bdev_info binfo; + void *holder = blkdev; + int ret = -ENODEV; + + lockdep_assert_held(&pstore_blk_lock); + + /* hold bdev exclusively */ + memset(&binfo, 0, sizeof(binfo)); + bdev = psblk_get_bdev(holder, &binfo); + if (IS_ERR(bdev)) { + pr_err("failed to open '%s'!\n", blkdev); + return PTR_ERR(bdev); + } + + /* only allow driver matching the @blkdev */ + if (!binfo.devt || (!best_effort && + MAJOR(binfo.devt) != info->major)) { + pr_debug("invalid major %u (expect %u)\n", + info->major, MAJOR(binfo.devt)); + ret = -ENODEV; + goto err_put_bdev; + } + + /* psblk_bdev must be assigned before register to pstore/blk */ + psblk_bdev = bdev; + blkdev_panic_write = info->panic_write; + + /* Copy back block device details. */ + info->devt = binfo.devt; + info->nr_sects = binfo.nr_sects; + info->start_sect = binfo.start_sect; + + memset(&dev, 0, sizeof(dev)); + dev.total_size = info->nr_sects << SECTOR_SHIFT; + dev.flags = info->flags; + dev.read = psblk_generic_blk_read; + dev.write = psblk_generic_blk_write; + dev.erase = NULL; + dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL; + + ret = __register_pstore_device(&dev); + if (ret) + goto err_put_bdev; + + bdevname(bdev, bdev_name); + pr_info("attached %s%s\n", bdev_name, + info->panic_write ? "" : " (no dedicated panic_write!)"); + return 0; + +err_put_bdev: + psblk_bdev = NULL; + blkdev_panic_write = NULL; + psblk_put_bdev(bdev, holder); + return ret; +} + +/** + * register_pstore_blk() - register block device to pstore/blk + * + * @info: details on the desired block device interface + * + * Return: + * * 0 - OK + * * Others - something error. + */ +int register_pstore_blk(struct pstore_blk_info *info) +{ + int ret; + + mutex_lock(&pstore_blk_lock); + ret = __register_pstore_blk(info); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_pstore_blk); + +static void __unregister_pstore_blk(unsigned int major) +{ + struct pstore_device_info dev = { .read = psblk_generic_blk_read }; + void *holder = blkdev; + + lockdep_assert_held(&pstore_blk_lock); + if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) { + __unregister_pstore_device(&dev); + psblk_put_bdev(psblk_bdev, holder); + blkdev_panic_write = NULL; + psblk_bdev = NULL; + } +} + +/** + * unregister_pstore_blk() - unregister block device from pstore/blk + * + * @major: the major device number of device + */ +void unregister_pstore_blk(unsigned int major) +{ + mutex_lock(&pstore_blk_lock); + __unregister_pstore_blk(major); + mutex_unlock(&pstore_blk_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_blk); + +/* get information of pstore/blk */ +int pstore_blk_get_config(struct pstore_blk_config *info) +{ + strncpy(info->device, blkdev, 80); + info->max_reason = max_reason; + info->kmsg_size = check_size(kmsg_size, 4096); + info->pmsg_size = check_size(pmsg_size, 4096); + info->ftrace_size = check_size(ftrace_size, 4096); + info->console_size = check_size(console_size, 4096); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_blk_get_config); + +static int __init pstore_blk_init(void) +{ + struct pstore_blk_info info = { }; + int ret = 0; + + mutex_lock(&pstore_blk_lock); + if (!pstore_zone_info && best_effort && blkdev[0]) + ret = __register_pstore_blk(&info); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +late_initcall(pstore_blk_init); + +static void __exit pstore_blk_exit(void) +{ + mutex_lock(&pstore_blk_lock); + if (psblk_bdev) + __unregister_pstore_blk(MAJOR(psblk_bdev->bd_dev)); + else { + struct pstore_device_info dev = { }; + + if (pstore_zone_info) + dev.read = pstore_zone_info->read; + __unregister_pstore_device(&dev); + } + mutex_unlock(&pstore_blk_lock); +} +module_exit(pstore_blk_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("WeiXiong Liao <liaoweixiong@allwinnertech.com>"); +MODULE_AUTHOR("Kees Cook <keescook@chromium.org>"); +MODULE_DESCRIPTION("pstore backend for block devices"); diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index bfbfc2698070..5c0450701293 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -16,6 +16,7 @@ #include <linux/debugfs.h> #include <linux/err.h> #include <linux/cache.h> +#include <linux/slab.h> #include <asm/barrier.h> #include "internal.h" @@ -132,3 +133,56 @@ void pstore_unregister_ftrace(void) debugfs_remove_recursive(pstore_ftrace_dir); } + +ssize_t pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size) +{ + size_t dest_size, src_size, total, dest_off, src_off; + size_t dest_idx = 0, src_idx = 0, merged_idx = 0; + void *merged_buf; + struct pstore_ftrace_record *drec, *srec, *mrec; + size_t record_size = sizeof(struct pstore_ftrace_record); + + dest_off = *dest_log_size % record_size; + dest_size = *dest_log_size - dest_off; + + src_off = src_log_size % record_size; + src_size = src_log_size - src_off; + + total = dest_size + src_size; + merged_buf = kmalloc(total, GFP_KERNEL); + if (!merged_buf) + return -ENOMEM; + + drec = (struct pstore_ftrace_record *)(*dest_log + dest_off); + srec = (struct pstore_ftrace_record *)(src_log + src_off); + mrec = (struct pstore_ftrace_record *)(merged_buf); + + while (dest_size > 0 && src_size > 0) { + if (pstore_ftrace_read_timestamp(&drec[dest_idx]) < + pstore_ftrace_read_timestamp(&srec[src_idx])) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } else { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + } + + while (dest_size > 0) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } + + while (src_size > 0) { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + + kfree(*dest_log); + *dest_log = merged_buf; + *dest_log_size = total; + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_ftrace_combine_log); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d99b5d39aa90..c331efe8de95 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -22,18 +22,21 @@ #include <linux/magic.h> #include <linux/pstore.h> #include <linux/slab.h> -#include <linux/spinlock.h> #include <linux/uaccess.h> #include "internal.h" #define PSTORE_NAMELEN 64 -static DEFINE_SPINLOCK(allpstore_lock); -static LIST_HEAD(allpstore); +static DEFINE_MUTEX(records_list_lock); +static LIST_HEAD(records_list); + +static DEFINE_MUTEX(pstore_sb_lock); +static struct super_block *pstore_sb; struct pstore_private { struct list_head list; + struct dentry *dentry; struct pstore_record *record; size_t total_size; }; @@ -178,10 +181,22 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; struct pstore_record *record = p->record; + int rc = 0; if (!record->psi->erase) return -EPERM; + /* Make sure we can't race while removing this file. */ + mutex_lock(&records_list_lock); + if (!list_empty(&p->list)) + list_del_init(&p->list); + else + rc = -ENOENT; + p->dentry = NULL; + mutex_unlock(&records_list_lock); + if (rc) + return rc; + mutex_lock(&record->psi->read_mutex); record->psi->erase(record); mutex_unlock(&record->psi->read_mutex); @@ -192,15 +207,9 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) static void pstore_evict_inode(struct inode *inode) { struct pstore_private *p = inode->i_private; - unsigned long flags; clear_inode(inode); - if (p) { - spin_lock_irqsave(&allpstore_lock, flags); - list_del(&p->list); - spin_unlock_irqrestore(&allpstore_lock, flags); - free_pstore_private(p); - } + free_pstore_private(p); } static const struct inode_operations pstore_dir_inode_operations = { @@ -278,11 +287,54 @@ static const struct super_operations pstore_ops = { .show_options = pstore_show_options, }; -static struct super_block *pstore_sb; +static struct dentry *psinfo_lock_root(void) +{ + struct dentry *root; -bool pstore_is_mounted(void) + mutex_lock(&pstore_sb_lock); + /* + * Having no backend is fine -- no records appear. + * Not being mounted is fine -- nothing to do. + */ + if (!psinfo || !pstore_sb) { + mutex_unlock(&pstore_sb_lock); + return NULL; + } + + root = pstore_sb->s_root; + inode_lock(d_inode(root)); + mutex_unlock(&pstore_sb_lock); + + return root; +} + +int pstore_put_backend_records(struct pstore_info *psi) { - return pstore_sb != NULL; + struct pstore_private *pos, *tmp; + struct dentry *root; + int rc = 0; + + root = psinfo_lock_root(); + if (!root) + return 0; + + mutex_lock(&records_list_lock); + list_for_each_entry_safe(pos, tmp, &records_list, list) { + if (pos->record->psi == psi) { + list_del_init(&pos->list); + rc = simple_unlink(d_inode(root), pos->dentry); + if (WARN_ON(rc)) + break; + d_drop(pos->dentry); + dput(pos->dentry); + pos->dentry = NULL; + } + } + mutex_unlock(&records_list_lock); + + inode_unlock(d_inode(root)); + + return rc; } /* @@ -297,23 +349,20 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) int rc = 0; char name[PSTORE_NAMELEN]; struct pstore_private *private, *pos; - unsigned long flags; size_t size = record->size + record->ecc_notice_size; - WARN_ON(!inode_is_locked(d_inode(root))); + if (WARN_ON(!inode_is_locked(d_inode(root)))) + return -EINVAL; - spin_lock_irqsave(&allpstore_lock, flags); - list_for_each_entry(pos, &allpstore, list) { + rc = -EEXIST; + /* Skip records that are already present in the filesystem. */ + mutex_lock(&records_list_lock); + list_for_each_entry(pos, &records_list, list) { if (pos->record->type == record->type && pos->record->id == record->id && - pos->record->psi == record->psi) { - rc = -EEXIST; - break; - } + pos->record->psi == record->psi) + goto fail; } - spin_unlock_irqrestore(&allpstore_lock, flags); - if (rc) - return rc; rc = -ENOMEM; inode = pstore_get_inode(root->d_sb); @@ -334,6 +383,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) if (!dentry) goto fail_private; + private->dentry = dentry; private->record = record; inode->i_size = private->total_size = size; inode->i_private = private; @@ -343,9 +393,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) d_add(dentry, inode); - spin_lock_irqsave(&allpstore_lock, flags); - list_add(&private->list, &allpstore); - spin_unlock_irqrestore(&allpstore_lock, flags); + list_add(&private->list, &records_list); + mutex_unlock(&records_list_lock); return 0; @@ -353,8 +402,8 @@ fail_private: free_pstore_private(private); fail_inode: iput(inode); - fail: + mutex_unlock(&records_list_lock); return rc; } @@ -366,16 +415,13 @@ fail: */ void pstore_get_records(int quiet) { - struct pstore_info *psi = psinfo; struct dentry *root; - if (!psi || !pstore_sb) + root = psinfo_lock_root(); + if (!root) return; - root = pstore_sb->s_root; - - inode_lock(d_inode(root)); - pstore_get_backend_records(psi, root, quiet); + pstore_get_backend_records(psinfo, root, quiet); inode_unlock(d_inode(root)); } @@ -383,8 +429,6 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - pstore_sb = sb; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -405,6 +449,10 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) return -ENOMEM; + mutex_lock(&pstore_sb_lock); + pstore_sb = sb; + mutex_unlock(&pstore_sb_lock); + pstore_get_records(0); return 0; @@ -418,8 +466,17 @@ static struct dentry *pstore_mount(struct file_system_type *fs_type, static void pstore_kill_sb(struct super_block *sb) { + mutex_lock(&pstore_sb_lock); + WARN_ON(pstore_sb != sb); + kill_litter_super(sb); pstore_sb = NULL; + + mutex_lock(&records_list_lock); + INIT_LIST_HEAD(&records_list); + mutex_unlock(&records_list_lock); + + mutex_unlock(&pstore_sb_lock); } static struct file_system_type pstore_fs_type = { diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 7062ea4bc57c..7fb219042f13 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -12,9 +12,18 @@ extern unsigned long kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); extern void pstore_unregister_ftrace(void); +ssize_t pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size); #else static inline void pstore_register_ftrace(void) {} static inline void pstore_unregister_ftrace(void) {} +static inline ssize_t +pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size) +{ + *dest_log_size = 0; + return 0; +} #endif #ifdef CONFIG_PSTORE_PMSG @@ -31,9 +40,9 @@ extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); +extern int pstore_put_backend_records(struct pstore_info *psi); extern int pstore_mkfile(struct dentry *root, struct pstore_record *record); -extern bool pstore_is_mounted(void); extern void pstore_record_init(struct pstore_record *record, struct pstore_info *psi); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 408277ee3cdb..a9e297eefdff 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -44,7 +44,7 @@ static int pstore_update_ms = -1; module_param_named(update_ms, pstore_update_ms, int, 0600); MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " "(default is -1, which means runtime updates are disabled; " - "enabling this option is not safe, it may lead to further " + "enabling this option may not be safe; it may lead to further " "corruption on Oopses)"); /* Names should be in the same order as the enum pstore_type_id */ @@ -69,19 +69,25 @@ static void pstore_dowork(struct work_struct *); static DECLARE_WORK(pstore_work, pstore_dowork); /* - * pstore_lock just protects "psinfo" during - * calls to pstore_register() + * psinfo_lock protects "psinfo" during calls to + * pstore_register(), pstore_unregister(), and + * the filesystem mount/unmount routines. */ -static DEFINE_SPINLOCK(pstore_lock); +static DEFINE_MUTEX(psinfo_lock); struct pstore_info *psinfo; static char *backend; +module_param(backend, charp, 0444); +MODULE_PARM_DESC(backend, "specific backend to use"); + static char *compress = #ifdef CONFIG_PSTORE_COMPRESS_DEFAULT CONFIG_PSTORE_COMPRESS_DEFAULT; #else NULL; #endif +module_param(compress, charp, 0444); +MODULE_PARM_DESC(compress, "compression to use"); /* Compression parameters */ static struct crypto_comp *tfm; @@ -129,24 +135,12 @@ enum pstore_type_id pstore_name_to_type(const char *name) } EXPORT_SYMBOL_GPL(pstore_name_to_type); -static const char *get_reason_str(enum kmsg_dump_reason reason) +static void pstore_timer_kick(void) { - switch (reason) { - case KMSG_DUMP_PANIC: - return "Panic"; - case KMSG_DUMP_OOPS: - return "Oops"; - case KMSG_DUMP_EMERG: - return "Emergency"; - case KMSG_DUMP_RESTART: - return "Restart"; - case KMSG_DUMP_HALT: - return "Halt"; - case KMSG_DUMP_POWEROFF: - return "Poweroff"; - default: - return "Unknown"; - } + if (pstore_update_ms < 0) + return; + + mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); } /* @@ -393,7 +387,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned int part = 1; int ret; - why = get_reason_str(reason); + why = kmsg_dump_reason_str(reason); if (down_trylock(&psinfo->buf_lock)) { /* Failed to acquire lock: give up if we cannot wait. */ @@ -459,8 +453,10 @@ static void pstore_dump(struct kmsg_dumper *dumper, } ret = psinfo->write(&record); - if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) + if (ret == 0 && reason == KMSG_DUMP_OOPS) { pstore_new_entry = 1; + pstore_timer_kick(); + } total += record.size; part++; @@ -503,14 +499,20 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } static struct console pstore_console = { - .name = "pstore", .write = pstore_console_write, - .flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME, .index = -1, }; static void pstore_register_console(void) { + /* Show which backend is going to get console writes. */ + strscpy(pstore_console.name, psinfo->name, + sizeof(pstore_console.name)); + /* + * Always initialize flags here since prior unregister_console() + * calls may have changed settings (specifically CON_ENABLED). + */ + pstore_console.flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME; register_console(&pstore_console); } @@ -555,8 +557,6 @@ out: */ int pstore_register(struct pstore_info *psi) { - struct module *owner = psi->owner; - if (backend && strcmp(backend, psi->name)) { pr_warn("ignoring unexpected backend '%s'\n", psi->name); return -EPERM; @@ -576,11 +576,11 @@ int pstore_register(struct pstore_info *psi) return -EINVAL; } - spin_lock(&pstore_lock); + mutex_lock(&psinfo_lock); if (psinfo) { pr_warn("backend '%s' already loaded: ignoring '%s'\n", psinfo->name, psi->name); - spin_unlock(&pstore_lock); + mutex_unlock(&psinfo_lock); return -EBUSY; } @@ -589,21 +589,16 @@ int pstore_register(struct pstore_info *psi) psinfo = psi; mutex_init(&psinfo->read_mutex); sema_init(&psinfo->buf_lock, 1); - spin_unlock(&pstore_lock); - - if (owner && !try_module_get(owner)) { - psinfo = NULL; - return -EINVAL; - } if (psi->flags & PSTORE_FLAGS_DMESG) allocate_buf_for_compression(); - if (pstore_is_mounted()) - pstore_get_records(0); + pstore_get_records(0); - if (psi->flags & PSTORE_FLAGS_DMESG) + if (psi->flags & PSTORE_FLAGS_DMESG) { + pstore_dumper.max_reason = psinfo->max_reason; pstore_register_kmsg(); + } if (psi->flags & PSTORE_FLAGS_CONSOLE) pstore_register_console(); if (psi->flags & PSTORE_FLAGS_FTRACE) @@ -612,33 +607,36 @@ int pstore_register(struct pstore_info *psi) pstore_register_pmsg(); /* Start watching for new records, if desired. */ - if (pstore_update_ms >= 0) { - pstore_timer.expires = jiffies + - msecs_to_jiffies(pstore_update_ms); - add_timer(&pstore_timer); - } + pstore_timer_kick(); /* * Update the module parameter backend, so it is visible * through /sys/module/pstore/parameters/backend */ - backend = psi->name; + backend = kstrdup(psi->name, GFP_KERNEL); pr_info("Registered %s as persistent store backend\n", psi->name); - module_put(owner); - + mutex_unlock(&psinfo_lock); return 0; } EXPORT_SYMBOL_GPL(pstore_register); void pstore_unregister(struct pstore_info *psi) { - /* Stop timer and make sure all work has finished. */ - pstore_update_ms = -1; - del_timer_sync(&pstore_timer); - flush_work(&pstore_work); + /* It's okay to unregister nothing. */ + if (!psi) + return; + + mutex_lock(&psinfo_lock); + + /* Only one backend can be registered at a time. */ + if (WARN_ON(psi != psinfo)) { + mutex_unlock(&psinfo_lock); + return; + } + /* Unregister all callbacks. */ if (psi->flags & PSTORE_FLAGS_PMSG) pstore_unregister_pmsg(); if (psi->flags & PSTORE_FLAGS_FTRACE) @@ -648,10 +646,19 @@ void pstore_unregister(struct pstore_info *psi) if (psi->flags & PSTORE_FLAGS_DMESG) pstore_unregister_kmsg(); + /* Stop timer and make sure all work has finished. */ + del_timer_sync(&pstore_timer); + flush_work(&pstore_work); + + /* Remove all backend records from filesystem tree. */ + pstore_put_backend_records(psi); + free_buf_for_compression(); psinfo = NULL; + kfree(backend); backend = NULL; + mutex_unlock(&psinfo_lock); } EXPORT_SYMBOL_GPL(pstore_unregister); @@ -788,9 +795,7 @@ static void pstore_timefunc(struct timer_list *unused) schedule_work(&pstore_work); } - if (pstore_update_ms >= 0) - mod_timer(&pstore_timer, - jiffies + msecs_to_jiffies(pstore_update_ms)); + pstore_timer_kick(); } static void __init pstore_choose_compression(void) @@ -835,11 +840,5 @@ static void __exit pstore_exit(void) } module_exit(pstore_exit) -module_param(compress, charp, 0444); -MODULE_PARM_DESC(compress, "Pstore compression to use"); - -module_param(backend, charp, 0444); -MODULE_PARM_DESC(backend, "Pstore backend to use"); - MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); MODULE_LICENSE("GPL"); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 795622190c01..ca6d8a867285 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -21,6 +21,7 @@ #include <linux/pstore_ram.h> #include <linux/of.h> #include <linux/of_address.h> +#include "internal.h" #define RAMOOPS_KERNMSG_HDR "====" #define MIN_MEM_SIZE 4096UL @@ -53,22 +54,27 @@ MODULE_PARM_DESC(mem_size, "size of reserved RAM used to store oops/panic logs"); static unsigned int mem_type; -module_param(mem_type, uint, 0600); +module_param(mem_type, uint, 0400); MODULE_PARM_DESC(mem_type, "set to 1 to try to use unbuffered memory (default 0)"); -static int dump_oops = 1; -module_param(dump_oops, int, 0600); -MODULE_PARM_DESC(dump_oops, - "set to 1 to dump oopses, 0 to only dump panics (default 1)"); +static int ramoops_max_reason = -1; +module_param_named(max_reason, ramoops_max_reason, int, 0400); +MODULE_PARM_DESC(max_reason, + "maximum reason for kmsg dump (default 2: Oops and Panic) "); static int ramoops_ecc; -module_param_named(ecc, ramoops_ecc, int, 0600); +module_param_named(ecc, ramoops_ecc, int, 0400); MODULE_PARM_DESC(ramoops_ecc, "if non-zero, the option enables ECC support and specifies " "ECC buffer size in bytes (1 is a special value, means 16 " "bytes ECC)"); +static int ramoops_dump_oops = -1; +module_param_named(dump_oops, ramoops_dump_oops, int, 0400); +MODULE_PARM_DESC(dump_oops, + "(deprecated: use max_reason instead) set to 1 to dump oopses & panics, 0 to only dump panics"); + struct ramoops_context { struct persistent_ram_zone **dprzs; /* Oops dump zones */ struct persistent_ram_zone *cprz; /* Console zone */ @@ -81,7 +87,6 @@ struct ramoops_context { size_t console_size; size_t ftrace_size; size_t pmsg_size; - int dump_oops; u32 flags; struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; @@ -168,58 +173,6 @@ static bool prz_ok(struct persistent_ram_zone *prz) persistent_ram_ecc_string(prz, NULL, 0)); } -static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, - struct persistent_ram_zone *src) -{ - size_t dest_size, src_size, total, dest_off, src_off; - size_t dest_idx = 0, src_idx = 0, merged_idx = 0; - void *merged_buf; - struct pstore_ftrace_record *drec, *srec, *mrec; - size_t record_size = sizeof(struct pstore_ftrace_record); - - dest_off = dest->old_log_size % record_size; - dest_size = dest->old_log_size - dest_off; - - src_off = src->old_log_size % record_size; - src_size = src->old_log_size - src_off; - - total = dest_size + src_size; - merged_buf = kmalloc(total, GFP_KERNEL); - if (!merged_buf) - return -ENOMEM; - - drec = (struct pstore_ftrace_record *)(dest->old_log + dest_off); - srec = (struct pstore_ftrace_record *)(src->old_log + src_off); - mrec = (struct pstore_ftrace_record *)(merged_buf); - - while (dest_size > 0 && src_size > 0) { - if (pstore_ftrace_read_timestamp(&drec[dest_idx]) < - pstore_ftrace_read_timestamp(&srec[src_idx])) { - mrec[merged_idx++] = drec[dest_idx++]; - dest_size -= record_size; - } else { - mrec[merged_idx++] = srec[src_idx++]; - src_size -= record_size; - } - } - - while (dest_size > 0) { - mrec[merged_idx++] = drec[dest_idx++]; - dest_size -= record_size; - } - - while (src_size > 0) { - mrec[merged_idx++] = srec[src_idx++]; - src_size -= record_size; - } - - kfree(dest->old_log); - dest->old_log = merged_buf; - dest->old_log_size = total; - - return 0; -} - static ssize_t ramoops_pstore_read(struct pstore_record *record) { ssize_t size = 0; @@ -291,7 +244,12 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) tmp_prz->corrected_bytes += prz_next->corrected_bytes; tmp_prz->bad_blocks += prz_next->bad_blocks; - size = ftrace_log_combine(tmp_prz, prz_next); + + size = pstore_ftrace_combine_log( + &tmp_prz->old_log, + &tmp_prz->old_log_size, + prz_next->old_log, + prz_next->old_log_size); if (size) goto out; } @@ -382,16 +340,14 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) return -EINVAL; /* - * Out of the various dmesg dump types, ramoops is currently designed - * to only store crash logs, rather than storing general kernel logs. + * We could filter on record->reason here if we wanted to (which + * would duplicate what happened before the "max_reason" setting + * was added), but that would defeat the purpose of a system + * changing printk.always_kmsg_dump, so instead log everything that + * the kmsg dumper sends us, since it should be doing the filtering + * based on the combination of printk.always_kmsg_dump and our + * requested "max_reason". */ - if (record->reason != KMSG_DUMP_OOPS && - record->reason != KMSG_DUMP_PANIC) - return -EINVAL; - - /* Skip Oopes when configured to do so. */ - if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops) - return -EINVAL; /* * Explicitly only take the first part of any new crash. @@ -644,19 +600,25 @@ static int ramoops_init_prz(const char *name, return 0; } -static int ramoops_parse_dt_size(struct platform_device *pdev, - const char *propname, u32 *value) +/* Read a u32 from a dt property and make sure it's safe for an int. */ +static int ramoops_parse_dt_u32(struct platform_device *pdev, + const char *propname, + u32 default_value, u32 *value) { u32 val32 = 0; int ret; ret = of_property_read_u32(pdev->dev.of_node, propname, &val32); - if (ret < 0 && ret != -EINVAL) { + if (ret == -EINVAL) { + /* field is missing, use default value. */ + val32 = default_value; + } else if (ret < 0) { dev_err(&pdev->dev, "failed to parse property %s: %d\n", propname, ret); return ret; } + /* Sanity check our results. */ if (val32 > INT_MAX) { dev_err(&pdev->dev, "%s %u > INT_MAX\n", propname, val32); return -EOVERFLOW; @@ -687,23 +649,32 @@ static int ramoops_parse_dt(struct platform_device *pdev, pdata->mem_size = resource_size(res); pdata->mem_address = res->start; pdata->mem_type = of_property_read_bool(of_node, "unbuffered"); - pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops"); - -#define parse_size(name, field) { \ - ret = ramoops_parse_dt_size(pdev, name, &value); \ + /* + * Setting "no-dump-oops" is deprecated and will be ignored if + * "max_reason" is also specified. + */ + if (of_property_read_bool(of_node, "no-dump-oops")) + pdata->max_reason = KMSG_DUMP_PANIC; + else + pdata->max_reason = KMSG_DUMP_OOPS; + +#define parse_u32(name, field, default_value) { \ + ret = ramoops_parse_dt_u32(pdev, name, default_value, \ + &value); \ if (ret < 0) \ return ret; \ field = value; \ } - parse_size("record-size", pdata->record_size); - parse_size("console-size", pdata->console_size); - parse_size("ftrace-size", pdata->ftrace_size); - parse_size("pmsg-size", pdata->pmsg_size); - parse_size("ecc-size", pdata->ecc_info.ecc_size); - parse_size("flags", pdata->flags); + parse_u32("record-size", pdata->record_size, 0); + parse_u32("console-size", pdata->console_size, 0); + parse_u32("ftrace-size", pdata->ftrace_size, 0); + parse_u32("pmsg-size", pdata->pmsg_size, 0); + parse_u32("ecc-size", pdata->ecc_info.ecc_size, 0); + parse_u32("flags", pdata->flags, 0); + parse_u32("max-reason", pdata->max_reason, pdata->max_reason); -#undef parse_size +#undef parse_u32 /* * Some old Chromebooks relied on the kernel setting the @@ -785,7 +756,6 @@ static int ramoops_probe(struct platform_device *pdev) cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; cxt->pmsg_size = pdata->pmsg_size; - cxt->dump_oops = pdata->dump_oops; cxt->flags = pdata->flags; cxt->ecc_info = pdata->ecc_info; @@ -828,8 +798,10 @@ static int ramoops_probe(struct platform_device *pdev) * the single region size is how to check. */ cxt->pstore.flags = 0; - if (cxt->max_dump_cnt) + if (cxt->max_dump_cnt) { cxt->pstore.flags |= PSTORE_FLAGS_DMESG; + cxt->pstore.max_reason = pdata->max_reason; + } if (cxt->console_size) cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; if (cxt->max_ftrace_cnt) @@ -865,7 +837,7 @@ static int ramoops_probe(struct platform_device *pdev) mem_size = pdata->mem_size; mem_address = pdata->mem_address; record_size = pdata->record_size; - dump_oops = pdata->dump_oops; + ramoops_max_reason = pdata->max_reason; ramoops_console_size = pdata->console_size; ramoops_pmsg_size = pdata->pmsg_size; ramoops_ftrace_size = pdata->ftrace_size; @@ -948,7 +920,16 @@ static void __init ramoops_register_dummy(void) pdata.console_size = ramoops_console_size; pdata.ftrace_size = ramoops_ftrace_size; pdata.pmsg_size = ramoops_pmsg_size; - pdata.dump_oops = dump_oops; + /* If "max_reason" is set, its value has priority over "dump_oops". */ + if (ramoops_max_reason >= 0) + pdata.max_reason = ramoops_max_reason; + /* Otherwise, if "dump_oops" is set, parse it into "max_reason". */ + else if (ramoops_dump_oops != -1) + pdata.max_reason = ramoops_dump_oops ? KMSG_DUMP_OOPS + : KMSG_DUMP_PANIC; + /* And if neither are explicitly set, use the default. */ + else + pdata.max_reason = KMSG_DUMP_OOPS; pdata.flags = RAMOOPS_FLAG_FTRACE_PER_CPU; /* diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index c917c191e78c..aa8e0b65ff1a 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -283,7 +283,7 @@ static int notrace persistent_ram_update_user(struct persistent_ram_zone *prz, const void __user *s, unsigned int start, unsigned int count) { struct persistent_ram_buffer *buffer = prz->buffer; - int ret = unlikely(__copy_from_user(buffer->data + start, s, count)) ? + int ret = unlikely(copy_from_user(buffer->data + start, s, count)) ? -EFAULT : 0; persistent_ram_update_ecc(prz, start, count); return ret; @@ -348,8 +348,6 @@ int notrace persistent_ram_write_user(struct persistent_ram_zone *prz, int rem, ret = 0, c = count; size_t start; - if (unlikely(!access_ok(s, count))) - return -EFAULT; if (unlikely(c > prz->buffer_size)) { s += c - prz->buffer_size; c = prz->buffer_size; diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c new file mode 100644 index 000000000000..819428dfa32f --- /dev/null +++ b/fs/pstore/zone.c @@ -0,0 +1,1465 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide a pstore intermediate backend, organized into kernel memory + * allocated zones that are then mapped and flushed into a single + * contiguous region on a storage backend of some kind (block, mtd, etc). + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/mount.h> +#include <linux/printk.h> +#include <linux/fs.h> +#include <linux/pstore_zone.h> +#include <linux/kdev_t.h> +#include <linux/device.h> +#include <linux/namei.h> +#include <linux/fcntl.h> +#include <linux/uio.h> +#include <linux/writeback.h> +#include "internal.h" + +/** + * struct psz_head - header of zone to flush to storage + * + * @sig: signature to indicate header (PSZ_SIG xor PSZONE-type value) + * @datalen: length of data in @data + * @start: offset into @data where the beginning of the stored bytes begin + * @data: zone data. + */ +struct psz_buffer { +#define PSZ_SIG (0x43474244) /* DBGC */ + uint32_t sig; + atomic_t datalen; + atomic_t start; + uint8_t data[]; +}; + +/** + * struct psz_kmsg_header - kmsg dump-specific header to flush to storage + * + * @magic: magic num for kmsg dump header + * @time: kmsg dump trigger time + * @compressed: whether conpressed + * @counter: kmsg dump counter + * @reason: the kmsg dump reason (e.g. oops, panic, etc) + * @data: pointer to log data + * + * This is a sub-header for a kmsg dump, trailing after &psz_buffer. + */ +struct psz_kmsg_header { +#define PSTORE_KMSG_HEADER_MAGIC 0x4dfc3ae5 /* Just a random number */ + uint32_t magic; + struct timespec64 time; + bool compressed; + uint32_t counter; + enum kmsg_dump_reason reason; + uint8_t data[]; +}; + +/** + * struct pstore_zone - single stored buffer + * + * @off: zone offset of storage + * @type: front-end type for this zone + * @name: front-end name for this zone + * @buffer: pointer to data buffer managed by this zone + * @oldbuf: pointer to old data buffer + * @buffer_size: bytes in @buffer->data + * @should_recover: whether this zone should recover from storage + * @dirty: whether the data in @buffer dirty + * + * zone structure in memory. + */ +struct pstore_zone { + loff_t off; + const char *name; + enum pstore_type_id type; + + struct psz_buffer *buffer; + struct psz_buffer *oldbuf; + size_t buffer_size; + bool should_recover; + atomic_t dirty; +}; + +/** + * struct psz_context - all about running state of pstore/zone + * + * @kpszs: kmsg dump storage zones + * @ppsz: pmsg storage zone + * @cpsz: console storage zone + * @fpszs: ftrace storage zones + * @kmsg_max_cnt: max count of @kpszs + * @kmsg_read_cnt: counter of total read kmsg dumps + * @kmsg_write_cnt: counter of total kmsg dump writes + * @pmsg_read_cnt: counter of total read pmsg zone + * @console_read_cnt: counter of total read console zone + * @ftrace_max_cnt: max count of @fpszs + * @ftrace_read_cnt: counter of max read ftrace zone + * @oops_counter: counter of oops dumps + * @panic_counter: counter of panic dumps + * @recovered: whether finished recovering data from storage + * @on_panic: whether panic is happening + * @pstore_zone_info_lock: lock to @pstore_zone_info + * @pstore_zone_info: information from backend + * @pstore: structure for pstore + */ +struct psz_context { + struct pstore_zone **kpszs; + struct pstore_zone *ppsz; + struct pstore_zone *cpsz; + struct pstore_zone **fpszs; + unsigned int kmsg_max_cnt; + unsigned int kmsg_read_cnt; + unsigned int kmsg_write_cnt; + unsigned int pmsg_read_cnt; + unsigned int console_read_cnt; + unsigned int ftrace_max_cnt; + unsigned int ftrace_read_cnt; + /* + * These counters should be calculated during recovery. + * It records the oops/panic times after crashes rather than boots. + */ + unsigned int oops_counter; + unsigned int panic_counter; + atomic_t recovered; + atomic_t on_panic; + + /* + * pstore_zone_info_lock protects this entire structure during calls + * to register_pstore_zone()/unregister_pstore_zone(). + */ + struct mutex pstore_zone_info_lock; + struct pstore_zone_info *pstore_zone_info; + struct pstore_info pstore; +}; +static struct psz_context pstore_zone_cxt; + +static void psz_flush_all_dirty_zones(struct work_struct *); +static DECLARE_DELAYED_WORK(psz_cleaner, psz_flush_all_dirty_zones); + +/** + * enum psz_flush_mode - flush mode for psz_zone_write() + * + * @FLUSH_NONE: do not flush to storage but update data on memory + * @FLUSH_PART: just flush part of data including meta data to storage + * @FLUSH_META: just flush meta data of zone to storage + * @FLUSH_ALL: flush all of zone + */ +enum psz_flush_mode { + FLUSH_NONE = 0, + FLUSH_PART, + FLUSH_META, + FLUSH_ALL, +}; + +static inline int buffer_datalen(struct pstore_zone *zone) +{ + return atomic_read(&zone->buffer->datalen); +} + +static inline int buffer_start(struct pstore_zone *zone) +{ + return atomic_read(&zone->buffer->start); +} + +static inline bool is_on_panic(void) +{ + return atomic_read(&pstore_zone_cxt.on_panic); +} + +static ssize_t psz_zone_read_buffer(struct pstore_zone *zone, char *buf, + size_t len, unsigned long off) +{ + if (!buf || !zone || !zone->buffer) + return -EINVAL; + if (off > zone->buffer_size) + return -EINVAL; + len = min_t(size_t, len, zone->buffer_size - off); + memcpy(buf, zone->buffer->data + off, len); + return len; +} + +static int psz_zone_read_oldbuf(struct pstore_zone *zone, char *buf, + size_t len, unsigned long off) +{ + if (!buf || !zone || !zone->oldbuf) + return -EINVAL; + if (off > zone->buffer_size) + return -EINVAL; + len = min_t(size_t, len, zone->buffer_size - off); + memcpy(buf, zone->oldbuf->data + off, len); + return 0; +} + +static int psz_zone_write(struct pstore_zone *zone, + enum psz_flush_mode flush_mode, const char *buf, + size_t len, unsigned long off) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + ssize_t wcnt = 0; + ssize_t (*writeop)(const char *buf, size_t bytes, loff_t pos); + size_t wlen; + + if (off > zone->buffer_size) + return -EINVAL; + + wlen = min_t(size_t, len, zone->buffer_size - off); + if (buf && wlen) { + memcpy(zone->buffer->data + off, buf, wlen); + atomic_set(&zone->buffer->datalen, wlen + off); + } + + /* avoid to damage old records */ + if (!is_on_panic() && !atomic_read(&pstore_zone_cxt.recovered)) + goto dirty; + + writeop = is_on_panic() ? info->panic_write : info->write; + if (!writeop) + goto dirty; + + switch (flush_mode) { + case FLUSH_NONE: + if (unlikely(buf && wlen)) + goto dirty; + return 0; + case FLUSH_PART: + wcnt = writeop((const char *)zone->buffer->data + off, wlen, + zone->off + sizeof(*zone->buffer) + off); + if (wcnt != wlen) + goto dirty; + fallthrough; + case FLUSH_META: + wlen = sizeof(struct psz_buffer); + wcnt = writeop((const char *)zone->buffer, wlen, zone->off); + if (wcnt != wlen) + goto dirty; + break; + case FLUSH_ALL: + wlen = zone->buffer_size + sizeof(*zone->buffer); + wcnt = writeop((const char *)zone->buffer, wlen, zone->off); + if (wcnt != wlen) + goto dirty; + break; + } + + return 0; +dirty: + /* no need to mark dirty if going to try next zone */ + if (wcnt == -ENOMSG) + return -ENOMSG; + atomic_set(&zone->dirty, true); + /* flush dirty zones nicely */ + if (wcnt == -EBUSY && !is_on_panic()) + schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(500)); + return -EBUSY; +} + +static int psz_flush_dirty_zone(struct pstore_zone *zone) +{ + int ret; + + if (unlikely(!zone)) + return -EINVAL; + + if (unlikely(!atomic_read(&pstore_zone_cxt.recovered))) + return -EBUSY; + + if (!atomic_xchg(&zone->dirty, false)) + return 0; + + ret = psz_zone_write(zone, FLUSH_ALL, NULL, 0, 0); + if (ret) + atomic_set(&zone->dirty, true); + return ret; +} + +static int psz_flush_dirty_zones(struct pstore_zone **zones, unsigned int cnt) +{ + int i, ret; + struct pstore_zone *zone; + + if (!zones) + return -EINVAL; + + for (i = 0; i < cnt; i++) { + zone = zones[i]; + if (!zone) + return -EINVAL; + ret = psz_flush_dirty_zone(zone); + if (ret) + return ret; + } + return 0; +} + +static int psz_move_zone(struct pstore_zone *old, struct pstore_zone *new) +{ + const char *data = (const char *)old->buffer->data; + int ret; + + ret = psz_zone_write(new, FLUSH_ALL, data, buffer_datalen(old), 0); + if (ret) { + atomic_set(&new->buffer->datalen, 0); + atomic_set(&new->dirty, false); + return ret; + } + atomic_set(&old->buffer->datalen, 0); + return 0; +} + +static void psz_flush_all_dirty_zones(struct work_struct *work) +{ + struct psz_context *cxt = &pstore_zone_cxt; + int ret = 0; + + if (cxt->ppsz) + ret |= psz_flush_dirty_zone(cxt->ppsz); + if (cxt->cpsz) + ret |= psz_flush_dirty_zone(cxt->cpsz); + if (cxt->kpszs) + ret |= psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); + if (cxt->fpszs) + ret |= psz_flush_dirty_zones(cxt->fpszs, cxt->ftrace_max_cnt); + if (ret && cxt->pstore_zone_info) + schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(1000)); +} + +static int psz_kmsg_recover_data(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct pstore_zone *zone = NULL; + struct psz_buffer *buf; + unsigned long i; + ssize_t rcnt; + + if (!info->read) + return -EINVAL; + + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + zone = cxt->kpszs[i]; + if (unlikely(!zone)) + return -EINVAL; + if (atomic_read(&zone->dirty)) { + unsigned int wcnt = cxt->kmsg_write_cnt; + struct pstore_zone *new = cxt->kpszs[wcnt]; + int ret; + + ret = psz_move_zone(zone, new); + if (ret) { + pr_err("move zone from %lu to %d failed\n", + i, wcnt); + return ret; + } + cxt->kmsg_write_cnt = (wcnt + 1) % cxt->kmsg_max_cnt; + } + if (!zone->should_recover) + continue; + buf = zone->buffer; + rcnt = info->read((char *)buf, zone->buffer_size + sizeof(*buf), + zone->off); + if (rcnt != zone->buffer_size + sizeof(*buf)) + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + return 0; +} + +static int psz_kmsg_recover_meta(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct pstore_zone *zone; + size_t rcnt, len; + struct psz_buffer *buf; + struct psz_kmsg_header *hdr; + struct timespec64 time = { }; + unsigned long i; + /* + * Recover may on panic, we can't allocate any memory by kmalloc. + * So, we use local array instead. + */ + char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0}; + + if (!info->read) + return -EINVAL; + + len = sizeof(*buf) + sizeof(*hdr); + buf = (struct psz_buffer *)buffer_header; + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + zone = cxt->kpszs[i]; + if (unlikely(!zone)) + return -EINVAL; + + rcnt = info->read((char *)buf, len, zone->off); + if (rcnt == -ENOMSG) { + pr_debug("%s with id %lu may be broken, skip\n", + zone->name, i); + continue; + } else if (rcnt != len) { + pr_err("read %s with id %lu failed\n", zone->name, i); + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + + if (buf->sig != zone->buffer->sig) { + pr_debug("no valid data in kmsg dump zone %lu\n", i); + continue; + } + + if (zone->buffer_size < atomic_read(&buf->datalen)) { + pr_info("found overtop zone: %s: id %lu, off %lld, size %zu\n", + zone->name, i, zone->off, + zone->buffer_size); + continue; + } + + hdr = (struct psz_kmsg_header *)buf->data; + if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC) { + pr_info("found invalid zone: %s: id %lu, off %lld, size %zu\n", + zone->name, i, zone->off, + zone->buffer_size); + continue; + } + + /* + * we get the newest zone, and the next one must be the oldest + * or unused zone, because we do write one by one like a circle. + */ + if (hdr->time.tv_sec >= time.tv_sec) { + time.tv_sec = hdr->time.tv_sec; + cxt->kmsg_write_cnt = (i + 1) % cxt->kmsg_max_cnt; + } + + if (hdr->reason == KMSG_DUMP_OOPS) + cxt->oops_counter = + max(cxt->oops_counter, hdr->counter); + else if (hdr->reason == KMSG_DUMP_PANIC) + cxt->panic_counter = + max(cxt->panic_counter, hdr->counter); + + if (!atomic_read(&buf->datalen)) { + pr_debug("found erased zone: %s: id %lu, off %lld, size %zu, datalen %d\n", + zone->name, i, zone->off, + zone->buffer_size, + atomic_read(&buf->datalen)); + continue; + } + + if (!is_on_panic()) + zone->should_recover = true; + pr_debug("found nice zone: %s: id %lu, off %lld, size %zu, datalen %d\n", + zone->name, i, zone->off, + zone->buffer_size, atomic_read(&buf->datalen)); + } + + return 0; +} + +static int psz_kmsg_recover(struct psz_context *cxt) +{ + int ret; + + if (!cxt->kpszs) + return 0; + + ret = psz_kmsg_recover_meta(cxt); + if (ret) + goto recover_fail; + + ret = psz_kmsg_recover_data(cxt); + if (ret) + goto recover_fail; + + return 0; +recover_fail: + pr_debug("psz_recover_kmsg failed\n"); + return ret; +} + +static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct psz_buffer *oldbuf, tmpbuf; + int ret = 0; + char *buf; + ssize_t rcnt, len, start, off; + + if (!zone || zone->oldbuf) + return 0; + + if (is_on_panic()) { + /* save data as much as possible */ + psz_flush_dirty_zone(zone); + return 0; + } + + if (unlikely(!info->read)) + return -EINVAL; + + len = sizeof(struct psz_buffer); + rcnt = info->read((char *)&tmpbuf, len, zone->off); + if (rcnt != len) { + pr_debug("read zone %s failed\n", zone->name); + return (int)rcnt < 0 ? (int)rcnt : -EIO; + } + + if (tmpbuf.sig != zone->buffer->sig) { + pr_debug("no valid data in zone %s\n", zone->name); + return 0; + } + + if (zone->buffer_size < atomic_read(&tmpbuf.datalen) || + zone->buffer_size < atomic_read(&tmpbuf.start)) { + pr_info("found overtop zone: %s: off %lld, size %zu\n", + zone->name, zone->off, zone->buffer_size); + /* just keep going */ + return 0; + } + + if (!atomic_read(&tmpbuf.datalen)) { + pr_debug("found erased zone: %s: off %lld, size %zu, datalen %d\n", + zone->name, zone->off, zone->buffer_size, + atomic_read(&tmpbuf.datalen)); + return 0; + } + + pr_debug("found nice zone: %s: off %lld, size %zu, datalen %d\n", + zone->name, zone->off, zone->buffer_size, + atomic_read(&tmpbuf.datalen)); + + len = atomic_read(&tmpbuf.datalen) + sizeof(*oldbuf); + oldbuf = kzalloc(len, GFP_KERNEL); + if (!oldbuf) + return -ENOMEM; + + memcpy(oldbuf, &tmpbuf, sizeof(*oldbuf)); + buf = (char *)oldbuf + sizeof(*oldbuf); + len = atomic_read(&oldbuf->datalen); + start = atomic_read(&oldbuf->start); + off = zone->off + sizeof(*oldbuf); + + /* get part of data */ + rcnt = info->read(buf, len - start, off + start); + if (rcnt != len - start) { + pr_err("read zone %s failed\n", zone->name); + ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + goto free_oldbuf; + } + + /* get the rest of data */ + rcnt = info->read(buf + len - start, start, off); + if (rcnt != start) { + pr_err("read zone %s failed\n", zone->name); + ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + goto free_oldbuf; + } + + zone->oldbuf = oldbuf; + psz_flush_dirty_zone(zone); + return 0; + +free_oldbuf: + kfree(oldbuf); + return ret; +} + +static int psz_recover_zones(struct psz_context *cxt, + struct pstore_zone **zones, unsigned int cnt) +{ + int ret; + unsigned int i; + struct pstore_zone *zone; + + if (!zones) + return 0; + + for (i = 0; i < cnt; i++) { + zone = zones[i]; + if (unlikely(!zone)) + continue; + ret = psz_recover_zone(cxt, zone); + if (ret) + goto recover_fail; + } + + return 0; +recover_fail: + pr_debug("recover %s[%u] failed\n", zone->name, i); + return ret; +} + +/** + * psz_recovery() - recover data from storage + * @cxt: the context of pstore/zone + * + * recovery means reading data back from storage after rebooting + * + * Return: 0 on success, others on failure. + */ +static inline int psz_recovery(struct psz_context *cxt) +{ + int ret; + + if (atomic_read(&cxt->recovered)) + return 0; + + ret = psz_kmsg_recover(cxt); + if (ret) + goto out; + + ret = psz_recover_zone(cxt, cxt->ppsz); + if (ret) + goto out; + + ret = psz_recover_zone(cxt, cxt->cpsz); + if (ret) + goto out; + + ret = psz_recover_zones(cxt, cxt->fpszs, cxt->ftrace_max_cnt); + +out: + if (unlikely(ret)) + pr_err("recover failed\n"); + else { + pr_debug("recover end!\n"); + atomic_set(&cxt->recovered, 1); + } + return ret; +} + +static int psz_pstore_open(struct pstore_info *psi) +{ + struct psz_context *cxt = psi->data; + + cxt->kmsg_read_cnt = 0; + cxt->pmsg_read_cnt = 0; + cxt->console_read_cnt = 0; + cxt->ftrace_read_cnt = 0; + return 0; +} + +static inline bool psz_old_ok(struct pstore_zone *zone) +{ + if (zone && zone->oldbuf && atomic_read(&zone->oldbuf->datalen)) + return true; + return false; +} + +static inline bool psz_ok(struct pstore_zone *zone) +{ + if (zone && zone->buffer && buffer_datalen(zone)) + return true; + return false; +} + +static inline int psz_kmsg_erase(struct psz_context *cxt, + struct pstore_zone *zone, struct pstore_record *record) +{ + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + size_t size; + + if (unlikely(!psz_ok(zone))) + return 0; + + /* this zone is already updated, no need to erase */ + if (record->count != hdr->counter) + return 0; + + size = buffer_datalen(zone) + sizeof(*zone->buffer); + atomic_set(&zone->buffer->datalen, 0); + if (cxt->pstore_zone_info->erase) + return cxt->pstore_zone_info->erase(size, zone->off); + else + return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); +} + +static inline int psz_record_erase(struct psz_context *cxt, + struct pstore_zone *zone) +{ + if (unlikely(!psz_old_ok(zone))) + return 0; + + kfree(zone->oldbuf); + zone->oldbuf = NULL; + /* + * if there are new data in zone buffer, that means the old data + * are already invalid. It is no need to flush 0 (erase) to + * block device. + */ + if (!buffer_datalen(zone)) + return psz_zone_write(zone, FLUSH_META, NULL, 0, 0); + psz_flush_dirty_zone(zone); + return 0; +} + +static int psz_pstore_erase(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + + switch (record->type) { + case PSTORE_TYPE_DMESG: + if (record->id >= cxt->kmsg_max_cnt) + return -EINVAL; + return psz_kmsg_erase(cxt, cxt->kpszs[record->id], record); + case PSTORE_TYPE_PMSG: + return psz_record_erase(cxt, cxt->ppsz); + case PSTORE_TYPE_CONSOLE: + return psz_record_erase(cxt, cxt->cpsz); + case PSTORE_TYPE_FTRACE: + if (record->id >= cxt->ftrace_max_cnt) + return -EINVAL; + return psz_record_erase(cxt, cxt->fpszs[record->id]); + default: return -EINVAL; + } +} + +static void psz_write_kmsg_hdr(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + + hdr->magic = PSTORE_KMSG_HEADER_MAGIC; + hdr->compressed = record->compressed; + hdr->time.tv_sec = record->time.tv_sec; + hdr->time.tv_nsec = record->time.tv_nsec; + hdr->reason = record->reason; + if (hdr->reason == KMSG_DUMP_OOPS) + hdr->counter = ++cxt->oops_counter; + else if (hdr->reason == KMSG_DUMP_PANIC) + hdr->counter = ++cxt->panic_counter; + else + hdr->counter = 0; +} + +/* + * In case zone is broken, which may occur to MTD device, we try each zones, + * start at cxt->kmsg_write_cnt. + */ +static inline int notrace psz_kmsg_write_record(struct psz_context *cxt, + struct pstore_record *record) +{ + size_t size, hlen; + struct pstore_zone *zone; + unsigned int i; + + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + unsigned int zonenum, len; + int ret; + + zonenum = (cxt->kmsg_write_cnt + i) % cxt->kmsg_max_cnt; + zone = cxt->kpszs[zonenum]; + if (unlikely(!zone)) + return -ENOSPC; + + /* avoid destroying old data, allocate a new one */ + len = zone->buffer_size + sizeof(*zone->buffer); + zone->oldbuf = zone->buffer; + zone->buffer = kzalloc(len, GFP_KERNEL); + if (!zone->buffer) { + zone->buffer = zone->oldbuf; + return -ENOMEM; + } + zone->buffer->sig = zone->oldbuf->sig; + + pr_debug("write %s to zone id %d\n", zone->name, zonenum); + psz_write_kmsg_hdr(zone, record); + hlen = sizeof(struct psz_kmsg_header); + size = min_t(size_t, record->size, zone->buffer_size - hlen); + ret = psz_zone_write(zone, FLUSH_ALL, record->buf, size, hlen); + if (likely(!ret || ret != -ENOMSG)) { + cxt->kmsg_write_cnt = zonenum + 1; + cxt->kmsg_write_cnt %= cxt->kmsg_max_cnt; + /* no need to try next zone, free last zone buffer */ + kfree(zone->oldbuf); + zone->oldbuf = NULL; + return ret; + } + + pr_debug("zone %u may be broken, try next dmesg zone\n", + zonenum); + kfree(zone->buffer); + zone->buffer = zone->oldbuf; + zone->oldbuf = NULL; + } + + return -EBUSY; +} + +static int notrace psz_kmsg_write(struct psz_context *cxt, + struct pstore_record *record) +{ + int ret; + + /* + * Explicitly only take the first part of any new crash. + * If our buffer is larger than kmsg_bytes, this can never happen, + * and if our buffer is smaller than kmsg_bytes, we don't want the + * report split across multiple records. + */ + if (record->part != 1) + return -ENOSPC; + + if (!cxt->kpszs) + return -ENOSPC; + + ret = psz_kmsg_write_record(cxt, record); + if (!ret && is_on_panic()) { + /* ensure all data are flushed to storage when panic */ + pr_debug("try to flush other dirty zones\n"); + psz_flush_all_dirty_zones(NULL); + } + + /* always return 0 as we had handled it on buffer */ + return 0; +} + +static int notrace psz_record_write(struct pstore_zone *zone, + struct pstore_record *record) +{ + size_t start, rem; + bool is_full_data = false; + char *buf; + int cnt; + + if (!zone || !record) + return -ENOSPC; + + if (atomic_read(&zone->buffer->datalen) >= zone->buffer_size) + is_full_data = true; + + cnt = record->size; + buf = record->buf; + if (unlikely(cnt > zone->buffer_size)) { + buf += cnt - zone->buffer_size; + cnt = zone->buffer_size; + } + + start = buffer_start(zone); + rem = zone->buffer_size - start; + if (unlikely(rem < cnt)) { + psz_zone_write(zone, FLUSH_PART, buf, rem, start); + buf += rem; + cnt -= rem; + start = 0; + is_full_data = true; + } + + atomic_set(&zone->buffer->start, cnt + start); + psz_zone_write(zone, FLUSH_PART, buf, cnt, start); + + /** + * psz_zone_write will set datalen as start + cnt. + * It work if actual data length lesser than buffer size. + * If data length greater than buffer size, pmsg will rewrite to + * beginning of zone, which make buffer->datalen wrongly. + * So we should reset datalen as buffer size once actual data length + * greater than buffer size. + */ + if (is_full_data) { + atomic_set(&zone->buffer->datalen, zone->buffer_size); + psz_zone_write(zone, FLUSH_META, NULL, 0, 0); + } + return 0; +} + +static int notrace psz_pstore_write(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + + if (record->type == PSTORE_TYPE_DMESG && + record->reason == KMSG_DUMP_PANIC) + atomic_set(&cxt->on_panic, 1); + + /* + * if on panic, do not write except panic records + * Fix case that panic_write prints log which wakes up console backend. + */ + if (is_on_panic() && record->type != PSTORE_TYPE_DMESG) + return -EBUSY; + + switch (record->type) { + case PSTORE_TYPE_DMESG: + return psz_kmsg_write(cxt, record); + case PSTORE_TYPE_CONSOLE: + return psz_record_write(cxt->cpsz, record); + case PSTORE_TYPE_PMSG: + return psz_record_write(cxt->ppsz, record); + case PSTORE_TYPE_FTRACE: { + int zonenum = smp_processor_id(); + + if (!cxt->fpszs) + return -ENOSPC; + return psz_record_write(cxt->fpszs[zonenum], record); + } + default: + return -EINVAL; + } +} + +static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt) +{ + struct pstore_zone *zone = NULL; + + while (cxt->kmsg_read_cnt < cxt->kmsg_max_cnt) { + zone = cxt->kpszs[cxt->kmsg_read_cnt++]; + if (psz_ok(zone)) + return zone; + } + + if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt) + /* + * No need psz_old_ok(). Let psz_ftrace_read() do so for + * combination. psz_ftrace_read() should traverse over + * all zones in case of some zone without data. + */ + return cxt->fpszs[cxt->ftrace_read_cnt++]; + + if (cxt->pmsg_read_cnt == 0) { + cxt->pmsg_read_cnt++; + zone = cxt->ppsz; + if (psz_old_ok(zone)) + return zone; + } + + if (cxt->console_read_cnt == 0) { + cxt->console_read_cnt++; + zone = cxt->cpsz; + if (psz_old_ok(zone)) + return zone; + } + + return NULL; +} + +static int psz_kmsg_read_hdr(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + + if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC) + return -EINVAL; + record->compressed = hdr->compressed; + record->time.tv_sec = hdr->time.tv_sec; + record->time.tv_nsec = hdr->time.tv_nsec; + record->reason = hdr->reason; + record->count = hdr->counter; + return 0; +} + +static ssize_t psz_kmsg_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + ssize_t size, hlen = 0; + + size = buffer_datalen(zone); + /* Clear and skip this kmsg dump record if it has no valid header */ + if (psz_kmsg_read_hdr(zone, record)) { + atomic_set(&zone->buffer->datalen, 0); + atomic_set(&zone->dirty, 0); + return -ENOMSG; + } + size -= sizeof(struct psz_kmsg_header); + + if (!record->compressed) { + char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n", + kmsg_dump_reason_str(record->reason), + record->count); + hlen = strlen(buf); + record->buf = krealloc(buf, hlen + size, GFP_KERNEL); + if (!record->buf) { + kfree(buf); + return -ENOMEM; + } + } else { + record->buf = kmalloc(size, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + } + + size = psz_zone_read_buffer(zone, record->buf + hlen, size, + sizeof(struct psz_kmsg_header)); + if (unlikely(size < 0)) { + kfree(record->buf); + return -ENOMSG; + } + + return size + hlen; +} + +/* try to combine all ftrace zones */ +static ssize_t psz_ftrace_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_context *cxt; + struct psz_buffer *buf; + int ret; + + if (!zone || !record) + return -ENOSPC; + + if (!psz_old_ok(zone)) + goto out; + + buf = (struct psz_buffer *)zone->oldbuf; + if (!buf) + return -ENOMSG; + + ret = pstore_ftrace_combine_log(&record->buf, &record->size, + (char *)buf->data, atomic_read(&buf->datalen)); + if (unlikely(ret)) + return ret; + +out: + cxt = record->psi->data; + if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt) + /* then, read next ftrace zone */ + return -ENOMSG; + record->id = 0; + return record->size ? record->size : -ENOMSG; +} + +static ssize_t psz_record_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + size_t len; + struct psz_buffer *buf; + + if (!zone || !record) + return -ENOSPC; + + buf = (struct psz_buffer *)zone->oldbuf; + if (!buf) + return -ENOMSG; + + len = atomic_read(&buf->datalen); + record->buf = kmalloc(len, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + + if (unlikely(psz_zone_read_oldbuf(zone, record->buf, len, 0))) { + kfree(record->buf); + return -ENOMSG; + } + + return len; +} + +static ssize_t psz_pstore_read(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + ssize_t (*readop)(struct pstore_zone *zone, + struct pstore_record *record); + struct pstore_zone *zone; + ssize_t ret; + + /* before read, we must recover from storage */ + ret = psz_recovery(cxt); + if (ret) + return ret; + +next_zone: + zone = psz_read_next_zone(cxt); + if (!zone) + return 0; + + record->type = zone->type; + switch (record->type) { + case PSTORE_TYPE_DMESG: + readop = psz_kmsg_read; + record->id = cxt->kmsg_read_cnt - 1; + break; + case PSTORE_TYPE_FTRACE: + readop = psz_ftrace_read; + break; + case PSTORE_TYPE_CONSOLE: + fallthrough; + case PSTORE_TYPE_PMSG: + readop = psz_record_read; + break; + default: + goto next_zone; + } + + ret = readop(zone, record); + if (ret == -ENOMSG) + goto next_zone; + return ret; +} + +static struct psz_context pstore_zone_cxt = { + .pstore_zone_info_lock = + __MUTEX_INITIALIZER(pstore_zone_cxt.pstore_zone_info_lock), + .recovered = ATOMIC_INIT(0), + .on_panic = ATOMIC_INIT(0), + .pstore = { + .owner = THIS_MODULE, + .open = psz_pstore_open, + .read = psz_pstore_read, + .write = psz_pstore_write, + .erase = psz_pstore_erase, + }, +}; + +static void psz_free_zone(struct pstore_zone **pszone) +{ + struct pstore_zone *zone = *pszone; + + if (!zone) + return; + + kfree(zone->buffer); + kfree(zone); + *pszone = NULL; +} + +static void psz_free_zones(struct pstore_zone ***pszones, unsigned int *cnt) +{ + struct pstore_zone **zones = *pszones; + + if (!zones) + return; + + while (*cnt > 0) { + (*cnt)--; + psz_free_zone(&(zones[*cnt])); + } + kfree(zones); + *pszones = NULL; +} + +static void psz_free_all_zones(struct psz_context *cxt) +{ + if (cxt->kpszs) + psz_free_zones(&cxt->kpszs, &cxt->kmsg_max_cnt); + if (cxt->ppsz) + psz_free_zone(&cxt->ppsz); + if (cxt->cpsz) + psz_free_zone(&cxt->cpsz); + if (cxt->fpszs) + psz_free_zones(&cxt->fpszs, &cxt->ftrace_max_cnt); +} + +static struct pstore_zone *psz_init_zone(enum pstore_type_id type, + loff_t *off, size_t size) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + struct pstore_zone *zone; + const char *name = pstore_type_to_name(type); + + if (!size) + return NULL; + + if (*off + size > info->total_size) { + pr_err("no room for %s (0x%zx@0x%llx over 0x%lx)\n", + name, size, *off, info->total_size); + return ERR_PTR(-ENOMEM); + } + + zone = kzalloc(sizeof(struct pstore_zone), GFP_KERNEL); + if (!zone) + return ERR_PTR(-ENOMEM); + + zone->buffer = kmalloc(size, GFP_KERNEL); + if (!zone->buffer) { + kfree(zone); + return ERR_PTR(-ENOMEM); + } + memset(zone->buffer, 0xFF, size); + zone->off = *off; + zone->name = name; + zone->type = type; + zone->buffer_size = size - sizeof(struct psz_buffer); + zone->buffer->sig = type ^ PSZ_SIG; + zone->oldbuf = NULL; + atomic_set(&zone->dirty, 0); + atomic_set(&zone->buffer->datalen, 0); + atomic_set(&zone->buffer->start, 0); + + *off += size; + + pr_debug("pszone %s: off 0x%llx, %zu header, %zu data\n", zone->name, + zone->off, sizeof(*zone->buffer), zone->buffer_size); + return zone; +} + +static struct pstore_zone **psz_init_zones(enum pstore_type_id type, + loff_t *off, size_t total_size, ssize_t record_size, + unsigned int *cnt) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + struct pstore_zone **zones, *zone; + const char *name = pstore_type_to_name(type); + int c, i; + + *cnt = 0; + if (!total_size || !record_size) + return NULL; + + if (*off + total_size > info->total_size) { + pr_err("no room for zones %s (0x%zx@0x%llx over 0x%lx)\n", + name, total_size, *off, info->total_size); + return ERR_PTR(-ENOMEM); + } + + c = total_size / record_size; + zones = kcalloc(c, sizeof(*zones), GFP_KERNEL); + if (!zones) { + pr_err("allocate for zones %s failed\n", name); + return ERR_PTR(-ENOMEM); + } + memset(zones, 0, c * sizeof(*zones)); + + for (i = 0; i < c; i++) { + zone = psz_init_zone(type, off, record_size); + if (!zone || IS_ERR(zone)) { + pr_err("initialize zones %s failed\n", name); + psz_free_zones(&zones, &i); + return (void *)zone; + } + zones[i] = zone; + } + + *cnt = c; + return zones; +} + +static int psz_alloc_zones(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + loff_t off = 0; + int err; + size_t off_size = 0; + + off_size += info->pmsg_size; + cxt->ppsz = psz_init_zone(PSTORE_TYPE_PMSG, &off, info->pmsg_size); + if (IS_ERR(cxt->ppsz)) { + err = PTR_ERR(cxt->ppsz); + cxt->ppsz = NULL; + goto free_out; + } + + off_size += info->console_size; + cxt->cpsz = psz_init_zone(PSTORE_TYPE_CONSOLE, &off, + info->console_size); + if (IS_ERR(cxt->cpsz)) { + err = PTR_ERR(cxt->cpsz); + cxt->cpsz = NULL; + goto free_out; + } + + off_size += info->ftrace_size; + cxt->fpszs = psz_init_zones(PSTORE_TYPE_FTRACE, &off, + info->ftrace_size, + info->ftrace_size / nr_cpu_ids, + &cxt->ftrace_max_cnt); + if (IS_ERR(cxt->fpszs)) { + err = PTR_ERR(cxt->fpszs); + cxt->fpszs = NULL; + goto free_out; + } + + cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off, + info->total_size - off_size, + info->kmsg_size, &cxt->kmsg_max_cnt); + if (IS_ERR(cxt->kpszs)) { + err = PTR_ERR(cxt->kpszs); + cxt->kpszs = NULL; + goto free_out; + } + + return 0; +free_out: + psz_free_all_zones(cxt); + return err; +} + +/** + * register_pstore_zone() - register to pstore/zone + * + * @info: back-end driver information. See &struct pstore_zone_info. + * + * Only one back-end at one time. + * + * Return: 0 on success, others on failure. + */ +int register_pstore_zone(struct pstore_zone_info *info) +{ + int err = -EINVAL; + struct psz_context *cxt = &pstore_zone_cxt; + + if (info->total_size < 4096) { + pr_warn("total_size must be >= 4096\n"); + return -EINVAL; + } + + if (!info->kmsg_size && !info->pmsg_size && !info->console_size && + !info->ftrace_size) { + pr_warn("at least one record size must be non-zero\n"); + return -EINVAL; + } + + if (!info->name || !info->name[0]) + return -EINVAL; + +#define check_size(name, size) { \ + if (info->name > 0 && info->name < (size)) { \ + pr_err(#name " must be over %d\n", (size)); \ + return -EINVAL; \ + } \ + if (info->name & (size - 1)) { \ + pr_err(#name " must be a multiple of %d\n", \ + (size)); \ + return -EINVAL; \ + } \ + } + + check_size(total_size, 4096); + check_size(kmsg_size, SECTOR_SIZE); + check_size(pmsg_size, SECTOR_SIZE); + check_size(console_size, SECTOR_SIZE); + check_size(ftrace_size, SECTOR_SIZE); + +#undef check_size + + /* + * the @read and @write must be applied. + * if no @read, pstore may mount failed. + * if no @write, pstore do not support to remove record file. + */ + if (!info->read || !info->write) { + pr_err("no valid general read/write interface\n"); + return -EINVAL; + } + + mutex_lock(&cxt->pstore_zone_info_lock); + if (cxt->pstore_zone_info) { + pr_warn("'%s' already loaded: ignoring '%s'\n", + cxt->pstore_zone_info->name, info->name); + mutex_unlock(&cxt->pstore_zone_info_lock); + return -EBUSY; + } + cxt->pstore_zone_info = info; + + pr_debug("register %s with properties:\n", info->name); + pr_debug("\ttotal size : %ld Bytes\n", info->total_size); + pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size); + pr_debug("\tpmsg size : %ld Bytes\n", info->pmsg_size); + pr_debug("\tconsole size : %ld Bytes\n", info->console_size); + pr_debug("\tftrace size : %ld Bytes\n", info->ftrace_size); + + err = psz_alloc_zones(cxt); + if (err) { + pr_err("alloc zones failed\n"); + goto fail_out; + } + + if (info->kmsg_size) { + cxt->pstore.bufsize = cxt->kpszs[0]->buffer_size - + sizeof(struct psz_kmsg_header); + cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); + if (!cxt->pstore.buf) { + err = -ENOMEM; + goto fail_free; + } + } + cxt->pstore.data = cxt; + + pr_info("registered %s as backend for", info->name); + cxt->pstore.max_reason = info->max_reason; + cxt->pstore.name = info->name; + if (info->kmsg_size) { + cxt->pstore.flags |= PSTORE_FLAGS_DMESG; + pr_cont(" kmsg(%s", + kmsg_dump_reason_str(cxt->pstore.max_reason)); + if (cxt->pstore_zone_info->panic_write) + pr_cont(",panic_write"); + pr_cont(")"); + } + if (info->pmsg_size) { + cxt->pstore.flags |= PSTORE_FLAGS_PMSG; + pr_cont(" pmsg"); + } + if (info->console_size) { + cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; + pr_cont(" console"); + } + if (info->ftrace_size) { + cxt->pstore.flags |= PSTORE_FLAGS_FTRACE; + pr_cont(" ftrace"); + } + pr_cont("\n"); + + err = pstore_register(&cxt->pstore); + if (err) { + pr_err("registering with pstore failed\n"); + goto fail_free; + } + mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock); + + return 0; + +fail_free: + kfree(cxt->pstore.buf); + cxt->pstore.buf = NULL; + cxt->pstore.bufsize = 0; + psz_free_all_zones(cxt); +fail_out: + pstore_zone_cxt.pstore_zone_info = NULL; + mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock); + return err; +} +EXPORT_SYMBOL_GPL(register_pstore_zone); + +/** + * unregister_pstore_zone() - unregister to pstore/zone + * + * @info: back-end driver information. See struct pstore_zone_info. + */ +void unregister_pstore_zone(struct pstore_zone_info *info) +{ + struct psz_context *cxt = &pstore_zone_cxt; + + mutex_lock(&cxt->pstore_zone_info_lock); + if (!cxt->pstore_zone_info) { + mutex_unlock(&cxt->pstore_zone_info_lock); + return; + } + + /* Stop incoming writes from pstore. */ + pstore_unregister(&cxt->pstore); + + /* Flush any pending writes. */ + psz_flush_all_dirty_zones(NULL); + flush_delayed_work(&psz_cleaner); + + /* Clean up allocations. */ + kfree(cxt->pstore.buf); + cxt->pstore.buf = NULL; + cxt->pstore.bufsize = 0; + cxt->pstore_zone_info = NULL; + + psz_free_all_zones(cxt); + + /* Clear counters and zone state. */ + cxt->oops_counter = 0; + cxt->panic_counter = 0; + atomic_set(&cxt->recovered, 0); + atomic_set(&cxt->on_panic, 0); + + mutex_unlock(&cxt->pstore_zone_info_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_zone); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("WeiXiong Liao <liaoweixiong@allwinnertech.com>"); +MODULE_AUTHOR("Kees Cook <keescook@chromium.org>"); +MODULE_DESCRIPTION("Storage Manager for pstore/blk"); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 345db56c98fd..755293c8c71a 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -99,10 +99,9 @@ static int qnx6_readpage(struct file *file, struct page *page) return mpage_readpage(page, qnx6_get_block); } -static int qnx6_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void qnx6_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, qnx6_get_block); + mpage_readahead(rac, qnx6_get_block); } /* @@ -499,7 +498,7 @@ static sector_t qnx6_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations qnx6_aops = { .readpage = qnx6_readpage, - .readpages = qnx6_readpages, + .readahead = qnx6_readahead, .bmap = qnx6_bmap }; diff --git a/fs/readdir.c b/fs/readdir.c index de2eceffdee8..a49f07c11cfb 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -157,17 +157,18 @@ static int fillonedir(struct dir_context *ctx, const char *name, int namlen, } buf->result++; dirent = buf->dirent; - if (!access_ok(dirent, + if (!user_write_access_begin(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; - if ( __put_user(d_ino, &dirent->d_ino) || - __put_user(offset, &dirent->d_offset) || - __put_user(namlen, &dirent->d_namlen) || - __copy_to_user(dirent->d_name, name, namlen) || - __put_user(0, dirent->d_name + namlen)) - goto efault; + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(offset, &dirent->d_offset, efault_end); + unsafe_put_user(namlen, &dirent->d_namlen, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_write_access_end(); return 0; +efault_end: + user_write_access_end(); efault: buf->result = -EFAULT; return -EFAULT; @@ -242,7 +243,7 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, return -EINTR; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; - if (!user_access_begin(prev, reclen + prev_reclen)) + if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; /* This might be 'dirent->d_off', but if so it will get overwritten */ @@ -251,14 +252,14 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); - user_access_end(); + user_write_access_end(); buf->current_dir = (void __user *)dirent + reclen; buf->prev_reclen = reclen; buf->count -= reclen; return 0; efault_end: - user_access_end(); + user_write_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -275,9 +276,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - if (!access_ok(dirent, count)) - return -EFAULT; - f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -327,7 +325,7 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, return -EINTR; dirent = buf->current_dir; prev = (void __user *)dirent - prev_reclen; - if (!user_access_begin(prev, reclen + prev_reclen)) + if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; /* This might be 'dirent->d_off', but if so it will get overwritten */ @@ -336,7 +334,7 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, unsafe_put_user(reclen, &dirent->d_reclen, efault_end); unsafe_put_user(d_type, &dirent->d_type, efault_end); unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); - user_access_end(); + user_write_access_end(); buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; @@ -344,7 +342,7 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, return 0; efault_end: - user_access_end(); + user_write_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -361,9 +359,6 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, }; int error; - if (!access_ok(dirent, count)) - return -EFAULT; - f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -376,7 +371,7 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, typeof(lastdirent->d_off) d_off = buf.ctx.pos; lastdirent = (void __user *) buf.current_dir - buf.prev_reclen; - if (__put_user(d_off, &lastdirent->d_off)) + if (put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else error = count - buf.count; @@ -424,17 +419,18 @@ static int compat_fillonedir(struct dir_context *ctx, const char *name, } buf->result++; dirent = buf->dirent; - if (!access_ok(dirent, + if (!user_write_access_begin(dirent, (unsigned long)(dirent->d_name + namlen + 1) - (unsigned long)dirent)) goto efault; - if ( __put_user(d_ino, &dirent->d_ino) || - __put_user(offset, &dirent->d_offset) || - __put_user(namlen, &dirent->d_namlen) || - __copy_to_user(dirent->d_name, name, namlen) || - __put_user(0, dirent->d_name + namlen)) - goto efault; + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(offset, &dirent->d_offset, efault_end); + unsafe_put_user(namlen, &dirent->d_namlen, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_write_access_end(); return 0; +efault_end: + user_write_access_end(); efault: buf->result = -EFAULT; return -EFAULT; @@ -471,7 +467,7 @@ struct compat_linux_dirent { struct compat_getdents_callback { struct dir_context ctx; struct compat_linux_dirent __user *current_dir; - struct compat_linux_dirent __user *previous; + int prev_reclen; int count; int error; }; @@ -479,13 +475,17 @@ struct compat_getdents_callback { static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct compat_linux_dirent __user * dirent; + struct compat_linux_dirent __user *dirent, *prev; struct compat_getdents_callback *buf = container_of(ctx, struct compat_getdents_callback, ctx); compat_ulong_t d_ino; int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + namlen + 2, sizeof(compat_long_t)); + int prev_reclen; + buf->error = verify_dirent_name(name, namlen); + if (unlikely(buf->error)) + return buf->error; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; @@ -494,29 +494,27 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, buf->error = -EOVERFLOW; return -EOVERFLOW; } - dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } + prev_reclen = buf->prev_reclen; + if (prev_reclen && signal_pending(current)) + return -EINTR; dirent = buf->current_dir; - if (__put_user(d_ino, &dirent->d_ino)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + prev = (void __user *) dirent - prev_reclen; + if (!user_write_access_begin(prev, reclen + prev_reclen)) goto efault; - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; + + unsafe_put_user(offset, &prev->d_off, efault_end); + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(reclen, &dirent->d_reclen, efault_end); + unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_write_access_end(); + + buf->prev_reclen = reclen; + buf->current_dir = (void __user *)dirent + reclen; buf->count -= reclen; return 0; +efault_end: + user_write_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -526,7 +524,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, struct compat_linux_dirent __user *, dirent, unsigned int, count) { struct fd f; - struct compat_linux_dirent __user * lastdirent; struct compat_getdents_callback buf = { .ctx.actor = compat_filldir, .current_dir = dirent, @@ -534,9 +531,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, }; int error; - if (!access_ok(dirent, count)) - return -EFAULT; - f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -544,8 +538,10 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, error = iterate_dir(f.file, &buf.ctx); if (error >= 0) error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { + if (buf.prev_reclen) { + struct compat_linux_dirent __user * lastdirent; + lastdirent = (void __user *)buf.current_dir - buf.prev_reclen; + if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 84cf8bdbec9c..0b641ae694f1 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -159,7 +159,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, barrier_done = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) - blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); inode_unlock(inode); if (barrier_done < 0) return barrier_done; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6419e6dacc39..0031070b3692 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1160,11 +1160,9 @@ failure: return retval; } -static int -reiserfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void reiserfs_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); + mpage_readahead(rac, reiserfs_get_block); } /* @@ -3434,7 +3432,7 @@ out: const struct address_space_operations reiserfs_address_space_operations = { .writepage = reiserfs_writepage, .readpage = reiserfs_readpage, - .readpages = reiserfs_readpages, + .readahead = reiserfs_readahead, .releasepage = reiserfs_releasepage, .invalidatepage = reiserfs_invalidatepage, .write_begin = reiserfs_write_begin, diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig index ad4c45788896..9737b8e68878 100644 --- a/fs/romfs/Kconfig +++ b/fs/romfs/Kconfig @@ -6,7 +6,7 @@ config ROMFS_FS This is a very small read-only file system mainly intended for initial ram disks of installation disks, but it could be used for other read-only media as well. Read - <file:Documentation/filesystems/romfs.txt> for details. + <file:Documentation/filesystems/romfs.rst> for details. To compile this file system support as a module, choose M here: the module will be called romfs. Note that the file system of your diff --git a/fs/splice.c b/fs/splice.c index 4e53efbd621d..5013565eb756 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1754,8 +1754,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, * The 'flags' used are the SPLICE_F_* variants, currently the only * applicable one is SPLICE_F_NONBLOCK. */ -static long do_tee(struct file *in, struct file *out, size_t len, - unsigned int flags) +long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { struct pipe_inode_info *ipipe = get_pipe_info(in); struct pipe_inode_info *opipe = get_pipe_info(out); diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 4f9b9fb59362..64f61330564a 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -13,6 +13,7 @@ * datablocks and metadata blocks. */ +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> @@ -27,44 +28,103 @@ #include "page_actor.h" /* - * Read the metadata block length, this is stored in the first two - * bytes of the metadata block. + * Returns the amount of bytes copied to the page actor. */ -static struct buffer_head *get_block_length(struct super_block *sb, - u64 *cur_index, int *offset, int *length) +static int copy_bio_to_actor(struct bio *bio, + struct squashfs_page_actor *actor, + int offset, int req_length) +{ + void *actor_addr = squashfs_first_page(actor); + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int copied_bytes = 0; + int actor_offset = 0; + + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) + return 0; + + while (copied_bytes < req_length) { + int bytes_to_copy = min_t(int, bvec->bv_len - offset, + PAGE_SIZE - actor_offset); + + bytes_to_copy = min_t(int, bytes_to_copy, + req_length - copied_bytes); + memcpy(actor_addr + actor_offset, + page_address(bvec->bv_page) + bvec->bv_offset + offset, + bytes_to_copy); + + actor_offset += bytes_to_copy; + copied_bytes += bytes_to_copy; + offset += bytes_to_copy; + + if (actor_offset >= PAGE_SIZE) { + actor_addr = squashfs_next_page(actor); + if (!actor_addr) + break; + actor_offset = 0; + } + if (offset >= bvec->bv_len) { + if (!bio_next_segment(bio, &iter_all)) + break; + offset = 0; + } + } + squashfs_finish_page(actor); + return copied_bytes; +} + +static int squashfs_bio_read(struct super_block *sb, u64 index, int length, + struct bio **biop, int *block_offset) { struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head *bh; - - bh = sb_bread(sb, *cur_index); - if (bh == NULL) - return NULL; - - if (msblk->devblksize - *offset == 1) { - *length = (unsigned char) bh->b_data[*offset]; - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *length |= (unsigned char) bh->b_data[0] << 8; - *offset = 1; - } else { - *length = (unsigned char) bh->b_data[*offset] | - (unsigned char) bh->b_data[*offset + 1] << 8; - *offset += 2; - - if (*offset == msblk->devblksize) { - put_bh(bh); - bh = sb_bread(sb, ++(*cur_index)); - if (bh == NULL) - return NULL; - *offset = 0; + const u64 read_start = round_down(index, msblk->devblksize); + const sector_t block = read_start >> msblk->devblksize_log2; + const u64 read_end = round_up(index + length, msblk->devblksize); + const sector_t block_end = read_end >> msblk->devblksize_log2; + int offset = read_start - round_down(index, PAGE_SIZE); + int total_len = (block_end - block) << msblk->devblksize_log2; + const int page_count = DIV_ROUND_UP(total_len + offset, PAGE_SIZE); + int error, i; + struct bio *bio; + + bio = bio_alloc(GFP_NOIO, page_count); + if (!bio) + return -ENOMEM; + + bio_set_dev(bio, sb->s_bdev); + bio->bi_opf = READ; + bio->bi_iter.bi_sector = block * (msblk->devblksize >> SECTOR_SHIFT); + + for (i = 0; i < page_count; ++i) { + unsigned int len = + min_t(unsigned int, PAGE_SIZE - offset, total_len); + struct page *page = alloc_page(GFP_NOIO); + + if (!page) { + error = -ENOMEM; + goto out_free_bio; + } + if (!bio_add_page(bio, page, len, offset)) { + error = -EIO; + goto out_free_bio; } + offset = 0; + total_len -= len; } - return bh; -} + error = submit_bio_wait(bio); + if (error) + goto out_free_bio; + *biop = bio; + *block_offset = index & ((1 << msblk->devblksize_log2) - 1); + return 0; + +out_free_bio: + bio_free_pages(bio); + bio_put(bio); + return error; +} /* * Read and decompress a metadata block or datablock. Length is non-zero @@ -76,129 +136,88 @@ static struct buffer_head *get_block_length(struct super_block *sb, * algorithms). */ int squashfs_read_data(struct super_block *sb, u64 index, int length, - u64 *next_index, struct squashfs_page_actor *output) + u64 *next_index, struct squashfs_page_actor *output) { struct squashfs_sb_info *msblk = sb->s_fs_info; - struct buffer_head **bh; - int offset = index & ((1 << msblk->devblksize_log2) - 1); - u64 cur_index = index >> msblk->devblksize_log2; - int bytes, compressed, b = 0, k = 0, avail, i; - - bh = kcalloc(((output->length + msblk->devblksize - 1) - >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); - if (bh == NULL) - return -ENOMEM; + struct bio *bio = NULL; + int compressed; + int res; + int offset; if (length) { /* * Datablock. */ - bytes = -offset; compressed = SQUASHFS_COMPRESSED_BLOCK(length); length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); - if (next_index) - *next_index = index + length; - TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", index, compressed ? "" : "un", length, output->length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto read_failure; - - for (b = 0; bytes < length; b++, cur_index++) { - bh[b] = sb_getblk(sb, cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b, bh); } else { /* * Metadata block. */ - if ((index + 2) > msblk->bytes_used) - goto read_failure; + const u8 *data; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); - bh[0] = get_block_length(sb, &cur_index, &offset, &length); - if (bh[0] == NULL) - goto read_failure; - b = 1; + if (index + 2 > msblk->bytes_used) { + res = -EIO; + goto out; + } + res = squashfs_bio_read(sb, index, 2, &bio, &offset); + if (res) + goto out; + + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { + res = -EIO; + goto out_free_bio; + } + /* Extract the length of the metadata block */ + data = page_address(bvec->bv_page) + bvec->bv_offset; + length = data[offset]; + if (offset <= bvec->bv_len - 1) { + length |= data[offset + 1] << 8; + } else { + if (WARN_ON_ONCE(!bio_next_segment(bio, &iter_all))) { + res = -EIO; + goto out_free_bio; + } + data = page_address(bvec->bv_page) + bvec->bv_offset; + length |= data[0] << 8; + } + bio_free_pages(bio); + bio_put(bio); - bytes = msblk->devblksize - offset; compressed = SQUASHFS_COMPRESSED(length); length = SQUASHFS_COMPRESSED_SIZE(length); - if (next_index) - *next_index = index + length + 2; + index += 2; TRACE("Block @ 0x%llx, %scompressed size %d\n", index, - compressed ? "" : "un", length); - - if (length < 0 || length > output->length || - (index + length) > msblk->bytes_used) - goto block_release; - - for (; bytes < length; b++) { - bh[b] = sb_getblk(sb, ++cur_index); - if (bh[b] == NULL) - goto block_release; - bytes += msblk->devblksize; - } - ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1); + compressed ? "" : "un", length); } + if (next_index) + *next_index = index + length; - for (i = 0; i < b; i++) { - wait_on_buffer(bh[i]); - if (!buffer_uptodate(bh[i])) - goto block_release; - } + res = squashfs_bio_read(sb, index, length, &bio, &offset); + if (res) + goto out; if (compressed) { - if (!msblk->stream) - goto read_failure; - length = squashfs_decompress(msblk, bh, b, offset, length, - output); - if (length < 0) - goto read_failure; - } else { - /* - * Block is uncompressed. - */ - int in, pg_offset = 0; - void *data = squashfs_first_page(output); - - for (bytes = length; k < b; k++) { - in = min(bytes, msblk->devblksize - offset); - bytes -= in; - while (in) { - if (pg_offset == PAGE_SIZE) { - data = squashfs_next_page(output); - pg_offset = 0; - } - avail = min_t(int, in, PAGE_SIZE - - pg_offset); - memcpy(data + pg_offset, bh[k]->b_data + offset, - avail); - in -= avail; - pg_offset += avail; - offset += avail; - } - offset = 0; - put_bh(bh[k]); + if (!msblk->stream) { + res = -EIO; + goto out_free_bio; } - squashfs_finish_page(output); + res = squashfs_decompress(msblk, bio, offset, length, output); + } else { + res = copy_bio_to_actor(bio, output, offset, length); } - kfree(bh); - return length; - -block_release: - for (; k < b; k++) - put_bh(bh[k]); +out_free_bio: + bio_free_pages(bio); + bio_put(bio); +out: + if (res < 0) + ERROR("Failed to read block 0x%llx: %d\n", index, res); -read_failure: - ERROR("squashfs_read_data failed to read block 0x%llx\n", - (unsigned long long) index); - kfree(bh); - return -EIO; + return res; } diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index ec8617523e56..1b9ccfd0aa51 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -10,13 +10,14 @@ * decompressor.h */ +#include <linux/bio.h> + struct squashfs_decompressor { void *(*init)(struct squashfs_sb_info *, void *); void *(*comp_opts)(struct squashfs_sb_info *, void *, int); void (*free)(void *); int (*decompress)(struct squashfs_sb_info *, void *, - struct buffer_head **, int, int, int, - struct squashfs_page_actor *); + struct bio *, int, int, struct squashfs_page_actor *); int id; char *name; int supported; diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c index c181dee235bb..db9f12a3ea05 100644 --- a/fs/squashfs/decompressor_multi.c +++ b/fs/squashfs/decompressor_multi.c @@ -6,7 +6,7 @@ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/cpumask.h> @@ -180,14 +180,15 @@ wait: } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, + struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream); res = msblk->decompressor->decompress(msblk, decomp_stream->stream, - bh, b, offset, length, output); + bio, offset, length, output); put_decomp_stream(decomp_stream, stream); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index 2a2a2d106440..b881b9283b7f 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/percpu.h> #include <linux/buffer_head.h> +#include <linux/local_lock.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -20,7 +21,8 @@ */ struct squashfs_stream { - void *stream; + void *stream; + local_lock_t lock; }; void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, @@ -41,6 +43,7 @@ void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, err = PTR_ERR(stream->stream); goto out; } + local_lock_init(&stream->lock); } kfree(comp_opts); @@ -72,15 +75,19 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, struct squashfs_page_actor *output) { - struct squashfs_stream __percpu *percpu = - (struct squashfs_stream __percpu *) msblk->stream; - struct squashfs_stream *stream = get_cpu_ptr(percpu); - int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, - offset, length, output); - put_cpu_ptr(stream); + struct squashfs_stream *stream; + int res; + + local_lock(&msblk->stream->lock); + stream = this_cpu_ptr(msblk->stream); + + res = msblk->decompressor->decompress(msblk, stream->stream, bio, + offset, length, output); + + local_unlock(&msblk->stream->lock); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c index 550c3e592032..4eb3d083d45e 100644 --- a/fs/squashfs/decompressor_single.c +++ b/fs/squashfs/decompressor_single.c @@ -7,7 +7,7 @@ #include <linux/types.h> #include <linux/mutex.h> #include <linux/slab.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -59,14 +59,15 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) } } -int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, - int b, int offset, int length, struct squashfs_page_actor *output) +int squashfs_decompress(struct squashfs_sb_info *msblk, struct bio *bio, + int offset, int length, + struct squashfs_page_actor *output) { int res; struct squashfs_stream *stream = msblk->stream; mutex_lock(&stream->mutex); - res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + res = msblk->decompressor->decompress(msblk, stream->stream, bio, offset, length, output); mutex_unlock(&stream->mutex); diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index c4e47e0588c7..233d5582fbee 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -4,7 +4,7 @@ * Phillip Lougher <phillip@squashfs.org.uk> */ -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -89,20 +89,23 @@ static void lz4_free(void *strm) static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lz4 *stream = strm; void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int bytes = length, res; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); + while (bio_next_segment(bio, &iter_all)) { + int avail = min(bytes, ((int)bvec->bv_len) - offset); + + data = page_address(bvec->bv_page) + bvec->bv_offset; + memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; - put_bh(bh[i]); } res = LZ4_decompress_safe(stream->input, stream->output, diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index aa3c3dafc33d..97bb7d92ddcd 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -9,7 +9,7 @@ */ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/lzo.h> @@ -63,21 +63,24 @@ static void lzo_free(void *strm) static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lzo *stream = strm; void *buff = stream->input, *data; - int avail, i, bytes = length, res; + int bytes = length, res; size_t out_len = output->length; - for (i = 0; i < b; i++) { - avail = min(bytes, msblk->devblksize - offset); - memcpy(buff, bh[i]->b_data + offset, avail); + while (bio_next_segment(bio, &iter_all)) { + int avail = min(bytes, ((int)bvec->bv_len) - offset); + + data = page_address(bvec->bv_page) + bvec->bv_offset; + memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; - put_bh(bh[i]); } res = lzo1x_decompress_safe(stream->input, (size_t)length, diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 2797763ed046..9783e01c8100 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -40,8 +40,8 @@ extern void *squashfs_decompressor_setup(struct super_block *, unsigned short); /* decompressor_xxx.c */ extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *); extern void squashfs_decompressor_destroy(struct squashfs_sb_info *); -extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **, - int, int, int, struct squashfs_page_actor *); +extern int squashfs_decompress(struct squashfs_sb_info *, struct bio *, + int, int, struct squashfs_page_actor *); extern int squashfs_max_decompressors(void); /* export.c */ diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index 4b2f2051a6dc..e80419aed862 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -10,7 +10,7 @@ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/xz.h> #include <linux/bitops.h> @@ -117,11 +117,12 @@ static void squashfs_xz_free(void *strm) static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { - enum xz_ret xz_err; - int avail, total = 0, k = 0; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int total = 0, error = 0; struct squashfs_xz *stream = strm; xz_dec_reset(stream->state); @@ -131,11 +132,23 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, stream->buf.out_size = PAGE_SIZE; stream->buf.out = squashfs_first_page(output); - do { - if (stream->buf.in_pos == stream->buf.in_size && k < b) { - avail = min(length, msblk->devblksize - offset); + for (;;) { + enum xz_ret xz_err; + + if (stream->buf.in_pos == stream->buf.in_size) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + /* XZ_STREAM_END must be reached. */ + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - stream->buf.in = bh[k]->b_data + offset; + stream->buf.in = data + offset; stream->buf.in_size = avail; stream->buf.in_pos = 0; offset = 0; @@ -150,23 +163,17 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } xz_err = xz_dec_run(stream->state, &stream->buf); - - if (stream->buf.in_pos == stream->buf.in_size && k < b) - put_bh(bh[k++]); - } while (xz_err == XZ_OK); + if (xz_err == XZ_STREAM_END) + break; + if (xz_err != XZ_OK) { + error = -EIO; + break; + } + } squashfs_finish_page(output); - if (xz_err != XZ_STREAM_END || k < b) - goto out; - - return total + stream->buf.out_pos; - -out: - for (; k < b; k++) - put_bh(bh[k]); - - return -EIO; + return error ? error : total + stream->buf.out_pos; } const struct squashfs_decompressor squashfs_xz_comp_ops = { diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index f2226afa1625..bcb881ec47f2 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -10,7 +10,7 @@ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/zlib.h> #include <linux/vmalloc.h> @@ -50,21 +50,35 @@ static void zlib_free(void *strm) static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { - int zlib_err, zlib_init = 0, k = 0; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); + int zlib_init = 0, error = 0; z_stream *stream = strm; stream->avail_out = PAGE_SIZE; stream->next_out = squashfs_first_page(output); stream->avail_in = 0; - do { - if (stream->avail_in == 0 && k < b) { - int avail = min(length, msblk->devblksize - offset); + for (;;) { + int zlib_err; + + if (stream->avail_in == 0) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + /* Z_STREAM_END must be reached. */ + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - stream->next_in = bh[k]->b_data + offset; + stream->next_in = data + offset; stream->avail_in = avail; offset = 0; } @@ -78,37 +92,28 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, if (!zlib_init) { zlib_err = zlib_inflateInit(stream); if (zlib_err != Z_OK) { - squashfs_finish_page(output); - goto out; + error = -EIO; + break; } zlib_init = 1; } zlib_err = zlib_inflate(stream, Z_SYNC_FLUSH); - - if (stream->avail_in == 0 && k < b) - put_bh(bh[k++]); - } while (zlib_err == Z_OK); + if (zlib_err == Z_STREAM_END) + break; + if (zlib_err != Z_OK) { + error = -EIO; + break; + } + } squashfs_finish_page(output); - if (zlib_err != Z_STREAM_END) - goto out; - - zlib_err = zlib_inflateEnd(stream); - if (zlib_err != Z_OK) - goto out; - - if (k < b) - goto out; - - return stream->total_out; - -out: - for (; k < b; k++) - put_bh(bh[k]); + if (!error) + if (zlib_inflateEnd(stream) != Z_OK) + error = -EIO; - return -EIO; + return error ? error : stream->total_out; } const struct squashfs_decompressor squashfs_zlib_comp_ops = { diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index b448c2a1d0ed..b7cb1faa652d 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -9,7 +9,7 @@ */ #include <linux/mutex.h> -#include <linux/buffer_head.h> +#include <linux/bio.h> #include <linux/slab.h> #include <linux/zstd.h> #include <linux/vmalloc.h> @@ -59,33 +59,44 @@ static void zstd_free(void *strm) static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, - struct buffer_head **bh, int b, int offset, int length, + struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct workspace *wksp = strm; ZSTD_DStream *stream; size_t total_out = 0; - size_t zstd_err; - int k = 0; + int error = 0; ZSTD_inBuffer in_buf = { NULL, 0, 0 }; ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + struct bvec_iter_all iter_all = {}; + struct bio_vec *bvec = bvec_init_iter_all(&iter_all); stream = ZSTD_initDStream(wksp->window_size, wksp->mem, wksp->mem_size); if (!stream) { ERROR("Failed to initialize zstd decompressor\n"); - goto out; + return -EIO; } out_buf.size = PAGE_SIZE; out_buf.dst = squashfs_first_page(output); - do { - if (in_buf.pos == in_buf.size && k < b) { - int avail = min(length, msblk->devblksize - offset); + for (;;) { + size_t zstd_err; + if (in_buf.pos == in_buf.size) { + const void *data; + int avail; + + if (!bio_next_segment(bio, &iter_all)) { + error = -EIO; + break; + } + + avail = min(length, ((int)bvec->bv_len) - offset); + data = page_address(bvec->bv_page) + bvec->bv_offset; length -= avail; - in_buf.src = bh[k]->b_data + offset; + in_buf.src = data + offset; in_buf.size = avail; in_buf.pos = 0; offset = 0; @@ -97,8 +108,8 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, /* Shouldn't run out of pages * before stream is done. */ - squashfs_finish_page(output); - goto out; + error = -EIO; + break; } out_buf.pos = 0; out_buf.size = PAGE_SIZE; @@ -107,29 +118,20 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, total_out -= out_buf.pos; zstd_err = ZSTD_decompressStream(stream, &out_buf, &in_buf); total_out += out_buf.pos; /* add the additional data produced */ - - if (in_buf.pos == in_buf.size && k < b) - put_bh(bh[k++]); - } while (zstd_err != 0 && !ZSTD_isError(zstd_err)); - - squashfs_finish_page(output); - - if (ZSTD_isError(zstd_err)) { - ERROR("zstd decompression error: %d\n", - (int)ZSTD_getErrorCode(zstd_err)); - goto out; + if (zstd_err == 0) + break; + + if (ZSTD_isError(zstd_err)) { + ERROR("zstd decompression error: %d\n", + (int)ZSTD_getErrorCode(zstd_err)); + error = -EIO; + break; + } } - if (k < b) - goto out; - - return (int)total_out; - -out: - for (; k < b; k++) - put_bh(bh[k]); + squashfs_finish_page(output); - return -EIO; + return error ? error : total_out; } const struct squashfs_decompressor squashfs_zstd_comp_ops = { diff --git a/fs/stat.c b/fs/stat.c index 030008796479..44f8ad346db4 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -22,6 +22,7 @@ #include <asm/unistd.h> #include "internal.h" +#include "mount.h" /** * generic_fillattr - Fill in the basic attributes from the inode struct @@ -70,15 +71,18 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, memset(stat, 0, sizeof(*stat)); stat->result_mask |= STATX_BASIC_STATS; - request_mask &= STATX_ALL; query_flags &= KSTAT_QUERY_FLAGS; /* allow the fs to override these if it really wants to */ - if (IS_NOATIME(inode)) + /* SB_NOATIME means filesystem supplies dummy atime value */ + if (inode->i_sb->s_flags & SB_NOATIME) stat->result_mask &= ~STATX_ATIME; if (IS_AUTOMOUNT(inode)) stat->attributes |= STATX_ATTR_AUTOMOUNT; + if (IS_DAX(inode)) + stat->attributes |= STATX_ATTR_DAX; + if (inode->i_op->getattr) return inode->i_op->getattr(path, stat, request_mask, query_flags); @@ -152,7 +156,8 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat, } EXPORT_SYMBOL(vfs_statx_fd); -inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags) +static inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, + int flags) { if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0) @@ -199,6 +204,11 @@ retry: goto out; error = vfs_getattr(&path, stat, request_mask, flags); + stat->mnt_id = real_mount(path.mnt)->mnt_id; + stat->result_mask |= STATX_MNT_ID; + if (path.mnt->mnt_root == path.dentry) + stat->attributes |= STATX_ATTR_MOUNT_ROOT; + stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -533,7 +543,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, } #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ -noinline_for_stack int +static noinline_for_stack int cp_statx(const struct kstat *stat, struct statx __user *buffer) { struct statx tmp; @@ -563,10 +573,29 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_rdev_minor = MINOR(stat->rdev); tmp.stx_dev_major = MAJOR(stat->dev); tmp.stx_dev_minor = MINOR(stat->dev); + tmp.stx_mnt_id = stat->mnt_id; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } +int do_statx(int dfd, const char __user *filename, unsigned flags, + unsigned int mask, struct statx __user *buffer) +{ + struct kstat stat; + int error; + + if (mask & STATX__RESERVED) + return -EINVAL; + if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) + return -EINVAL; + + error = vfs_statx(dfd, filename, flags, &stat, mask); + if (error) + return error; + + return cp_statx(&stat, buffer); +} + /** * sys_statx - System call to get enhanced stats * @dfd: Base directory to pathwalk from *or* fd to stat. @@ -583,19 +612,7 @@ SYSCALL_DEFINE5(statx, unsigned int, mask, struct statx __user *, buffer) { - struct kstat stat; - int error; - - if (mask & STATX__RESERVED) - return -EINVAL; - if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) - return -EINVAL; - - error = vfs_statx(dfd, filename, flags, &stat, mask); - if (error) - return error; - - return cp_statx(&stat, buffer); + return do_statx(dfd, filename, flags, mask, buffer); } #ifdef CONFIG_COMPAT diff --git a/fs/super.c b/fs/super.c index a288cd60d2ae..bf3b7685b52a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1598,12 +1598,10 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) int err; va_list args; - bdi = bdi_alloc(GFP_KERNEL); + bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; - bdi->name = sb->s_type->name; - va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); diff --git a/fs/sync.c b/fs/sync.c index 4d1ff010bc5a..c6f6f5be5682 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -161,7 +161,7 @@ SYSCALL_DEFINE1(syncfs, int, fd) { struct fd f = fdget(fd); struct super_block *sb; - int ret; + int ret, ret2; if (!f.file) return -EBADF; @@ -171,8 +171,10 @@ SYSCALL_DEFINE1(syncfs, int, fd) ret = sync_filesystem(sb); up_read(&sb->s_umount); + ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err); + fdput(f); - return ret; + return ret ? ret : ret2; } /** diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index aa85f2874a9f..59dffd5ca517 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #define pr_fmt(fmt) "sysfs: " fmt diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 26bbf960e2a2..f275fcda62fb 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/module.h> diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index db81cfbab9d6..e747c135c1d1 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index c4deecc80f67..5603530a1a52 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -6,7 +6,7 @@ * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * - * Please see Documentation/filesystems/sysfs.txt for more information. + * Please see Documentation/filesystems/sysfs.rst for more information. */ #include <linux/fs.h> diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig index d4edf7d9ae10..b4e23e03fbeb 100644 --- a/fs/sysv/Kconfig +++ b/fs/sysv/Kconfig @@ -28,7 +28,7 @@ config SYSV_FS tar" or preferably "info tar"). Note also that this option has nothing whatsoever to do with the option "System V IPC". Read about the System V file system in - <file:Documentation/filesystems/sysv-fs.txt>. + <file:Documentation/filesystems/sysv-fs.rst>. Saying Y here will enlarge your kernel by about 27 KB. To compile this as a module, choose M here: the module will be called diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index f985a3fbbb36..cc5c0abfd536 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -31,15 +31,9 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node, u8 *hash) { const struct ubifs_ch *ch = node; - SHASH_DESC_ON_STACK(shash, c->hash_tfm); - int err; - - shash->tfm = c->hash_tfm; - err = crypto_shash_digest(shash, node, le32_to_cpu(ch->len), hash); - if (err < 0) - return err; - return 0; + return crypto_shash_tfm_digest(c->hash_tfm, node, le32_to_cpu(ch->len), + hash); } /** @@ -53,15 +47,7 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node, static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash, u8 *hmac) { - SHASH_DESC_ON_STACK(shash, c->hmac_tfm); - int err; - - shash->tfm = c->hmac_tfm; - - err = crypto_shash_digest(shash, hash, c->hash_len, hmac); - if (err < 0) - return err; - return 0; + return crypto_shash_tfm_digest(c->hmac_tfm, hash, c->hash_len, hmac); } /** diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0f5a480fe264..31288d8fa2ce 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -815,7 +815,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum); return; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 29826c51883a..22bfda158f7f 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1095,7 +1095,7 @@ static int scan_check_cb(struct ubifs_info *c, return LPT_SCAN_CONTINUE; } - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) return -ENOMEM; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index ff5e0411cf2d..d76a19e460cd 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1596,7 +1596,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) if (!dbg_is_chk_lprops(c)) return 0; - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for ltab checking"); return 0; @@ -1845,7 +1845,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) void *buf, *p; pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to dump LPT"); return; diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 52a85c01397e..911d0555b9f2 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -68,12 +68,9 @@ static int mst_node_check_hash(const struct ubifs_info *c, u8 calc[UBIFS_MAX_HASH_LEN]; const void *node = mst; - SHASH_DESC_ON_STACK(shash, c->hash_tfm); - - shash->tfm = c->hash_tfm; - - crypto_shash_digest(shash, node + sizeof(struct ubifs_ch), - UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), calc); + crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch), + UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), + calc); if (ubifs_check_hash(c, expected, calc)) return -EPERM; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 283f9eb48410..2c294085ffed 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -977,7 +977,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) if (c->no_orphs) return 0; - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to check orphans"); return 0; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 01fcf7975047..b69ffac7e415 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -558,7 +558,7 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) return data == 0xFFFFFFFF; } -/* authenticate_sleb_hash and authenticate_sleb_hmac are split out for stack usage */ +/* authenticate_sleb_hash is split out for stack usage */ static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_hash, u8 *hash) { SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); @@ -569,15 +569,6 @@ static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_h return crypto_shash_final(hash_desc, hash); } -static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac) -{ - SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm); - - hmac_desc->tfm = c->hmac_tfm; - - return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac); -} - /** * authenticate_sleb - authenticate one scan LEB * @c: UBIFS file-system description object @@ -618,7 +609,8 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (err) goto out; - err = authenticate_sleb_hmac(c, hash, hmac); + err = crypto_shash_tfm_digest(c->hmac_tfm, hash, + c->hash_len, hmac); if (err) goto out; diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 6848de581ce1..26e1a49f3ba7 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -9,7 +9,7 @@ config UDF_FS compatible with standard unix file systems, it is also suitable for removable USB disks. Say Y if you intend to mount DVD discs or CDRW's written in packet mode, or if you want to use UDF for removable USB - disks. Please read <file:Documentation/filesystems/udf.txt>. + disks. Please read <file:Documentation/filesystems/udf.rst>. To compile this file system support as a module, choose M here: the module will be called udf. diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e875bc5668ee..adaba8e8b326 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -195,10 +195,9 @@ static int udf_readpage(struct file *file, struct page *page) return mpage_readpage(page, udf_get_block); } -static int udf_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void udf_readahead(struct readahead_control *rac) { - return mpage_readpages(mapping, pages, nr_pages, udf_get_block); + mpage_readahead(rac, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, @@ -234,7 +233,7 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block) const struct address_space_operations udf_aops = { .readpage = udf_readpage, - .readpages = udf_readpages, + .readahead = udf_readahead, .writepage = udf_writepage, .writepages = udf_writepages, .write_begin = udf_write_begin, diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 5c7ec121990d..f1094cdcd6cd 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -27,41 +27,38 @@ unsigned int udf_get_last_session(struct super_block *sb) { + struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); struct cdrom_multisession ms_info; - unsigned int vol_desc_start; - struct block_device *bdev = sb->s_bdev; - int i; - vol_desc_start = 0; - ms_info.addr_format = CDROM_LBA; - i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); + if (!cdi) { + udf_debug("CDROMMULTISESSION not supported.\n"); + return 0; + } - if (i == 0) { + ms_info.addr_format = CDROM_LBA; + if (cdrom_multisession(cdi, &ms_info) == 0) { udf_debug("XA disk: %s, vol_desc_start=%d\n", ms_info.xa_flag ? "yes" : "no", ms_info.addr.lba); if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ - vol_desc_start = ms_info.addr.lba; - } else { - udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); + return ms_info.addr.lba; } - return vol_desc_start; + return 0; } unsigned long udf_get_last_block(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; + struct cdrom_device_info *cdi = disk_to_cdi(bdev->bd_disk); unsigned long lblock = 0; /* - * ioctl failed or returned obviously bogus value? + * The cdrom layer call failed or returned obviously bogus value? * Try using the device size... */ - if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock) || - lblock == 0) + if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits; if (lblock) return lblock - 1; - else - return 0; + return 0; } diff --git a/fs/utimes.c b/fs/utimes.c index 1d17ce98cb80..b7b927502d6e 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -95,13 +95,13 @@ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, goto out; } - if (flags & ~AT_SYMLINK_NOFOLLOW) + if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) goto out; if (filename == NULL && dfd != AT_FDCWD) { struct fd f; - if (flags & AT_SYMLINK_NOFOLLOW) + if (flags) goto out; f = fdget(dfd); @@ -117,6 +117,8 @@ long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, if (!(flags & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (error) diff --git a/fs/verity/enable.c b/fs/verity/enable.c index d98bea308fd7..5ab3bbec8108 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -329,6 +329,8 @@ rollback: /** * fsverity_ioctl_enable() - enable verity on a file + * @filp: file to enable verity on + * @uarg: user pointer to fsverity_enable_arg * * Enable fs-verity on a file. See the "FS_IOC_ENABLE_VERITY" section of * Documentation/filesystems/fsverity.rst for the documentation. diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 74768cf539da..e96d99d5145e 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -61,7 +61,7 @@ struct merkle_tree_params { u64 level_start[FS_VERITY_MAX_LEVELS]; }; -/** +/* * fsverity_info - cached verity metadata for an inode * * When a verity file is first opened, an instance of this struct is allocated @@ -134,7 +134,7 @@ void __init fsverity_check_hash_algs(void); /* init.c */ -extern void __printf(3, 4) __cold +void __printf(3, 4) __cold fsverity_msg(const struct inode *inode, const char *level, const char *fmt, ...); diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 05049b68c745..df409a5682ed 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -11,6 +11,8 @@ /** * fsverity_ioctl_measure() - get a verity file's measurement + * @filp: file to get measurement of + * @_uarg: user pointer to fsverity_digest * * Retrieve the file measurement that the kernel is enforcing for reads from a * verity file. See the "FS_IOC_MEASURE_VERITY" section of diff --git a/fs/verity/open.c b/fs/verity/open.c index c5fe6948e262..d007db0c9304 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -330,6 +330,7 @@ EXPORT_SYMBOL_GPL(fsverity_prepare_setattr); /** * fsverity_cleanup_inode() - free the inode's verity info, if present + * @inode: an inode being evicted * * Filesystems must call this on inode eviction to free ->i_verity_info. */ diff --git a/fs/verity/signature.c b/fs/verity/signature.c index c8b255232de5..b14ed96387ec 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -28,6 +28,9 @@ static struct key *fsverity_keyring; /** * fsverity_verify_signature() - check a verity file's signature + * @vi: the file's fsverity_info + * @desc: the file's fsverity_descriptor + * @desc_size: size of @desc * * If the file's fs-verity descriptor includes a signature of the file * measurement, verify it against the certificates in the fs-verity keyring. diff --git a/fs/verity/verify.c b/fs/verity/verify.c index e0cb62da3864..a8b68c6f663d 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -179,6 +179,7 @@ out: /** * fsverity_verify_page() - verify a data page + * @page: the page to verity * * Verify a page that has just been read from a verity file. The page must be a * pagecache page that is still locked and not yet uptodate. @@ -206,6 +207,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); #ifdef CONFIG_BLOCK /** * fsverity_verify_bio() - verify a 'read' bio that has just completed + * @bio: the bio to verify * * Verify a set of pages that have just been read from a verity file. The pages * must be pagecache pages that are still locked and not yet uptodate. Pages @@ -264,6 +266,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_bio); /** * fsverity_enqueue_verify_work() - enqueue work on the fs-verity workqueue + * @work: the work to enqueue * * Enqueue verification work for asynchronous processing. */ diff --git a/fs/xattr.c b/fs/xattr.c index e13265e65871..91608d9bfc6a 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -876,6 +876,9 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, struct simple_xattr *new_xattr = NULL; int err = 0; + if (removed_size) + *removed_size = -1; + /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); @@ -914,9 +917,6 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, list_add(&new_xattr->list, &xattrs->head); xattr = NULL; } - - if (removed_size) - *removed_size = -1; out: spin_unlock(&xattrs->lock); if (xattr) { diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 4f95df476181..04611a1068b4 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -7,8 +7,6 @@ ccflags-y += -I $(srctree)/$(src) # needed for trace events ccflags-y += -I $(srctree)/$(src)/libxfs -ccflags-$(CONFIG_XFS_DEBUG) += -g - obj-$(CONFIG_XFS_FS) += xfs.o # this one should be compiled first, as the tracing macros can easily blow up @@ -101,9 +99,12 @@ xfs-y += xfs_log.o \ xfs_log_cil.o \ xfs_bmap_item.o \ xfs_buf_item.o \ + xfs_buf_item_recover.o \ + xfs_dquot_item_recover.o \ xfs_extfree_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ + xfs_inode_item_recover.o \ xfs_refcount_item.o \ xfs_rmap_item.o \ xfs_log_recover.o \ diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 1da94237a8cf..f1366475c389 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ __kmem_vmalloc(size_t size, xfs_km_flags_t flags) if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = __vmalloc(size, lflags); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 6143117770e9..34cbcfde9228 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -19,6 +19,7 @@ typedef unsigned __bitwise xfs_km_flags_t; #define KM_NOFS ((__force xfs_km_flags_t)0x0004u) #define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) #define KM_ZERO ((__force xfs_km_flags_t)0x0010u) +#define KM_NOLOCKDEP ((__force xfs_km_flags_t)0x0020u) /* * We use a special process flag to avoid recursive callbacks into @@ -30,7 +31,7 @@ kmem_flags_convert(xfs_km_flags_t flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_NOFS|KM_MAYFAIL|KM_ZERO)); + BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP)); lflags = GFP_KERNEL | __GFP_NOWARN; if (flags & KM_NOFS) @@ -49,6 +50,9 @@ kmem_flags_convert(xfs_km_flags_t flags) if (flags & KM_ZERO) lflags |= __GFP_ZERO; + if (flags & KM_NOLOCKDEP) + lflags |= __GFP_NOLOCKDEP; + return lflags; } diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index c0352edc8e41..f3fd0ee9a7f7 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index a851bf77f17b..6c22b12176b8 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 047f09f0be3c..a5b998e950fe 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e4fe3dca9883..3b1bd6e112f8 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -61,8 +61,8 @@ xfs_inode_hasattr( struct xfs_inode *ip) { if (!XFS_IFORK_Q(ip) || - (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_anextents == 0)) + (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_afp->if_nextents == 0)) return 0; return 1; } @@ -84,7 +84,7 @@ xfs_attr_get_ilocked( if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; - if (args->dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); if (xfs_bmap_one_block(args->dp, XFS_ATTR_FORK)) return xfs_attr_leaf_get(args); @@ -212,14 +212,14 @@ xfs_attr_set_args( * If the attribute list is non-existent or a shortform list, * upgrade it to a single-leaf-block attribute list. */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL || + (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && + dp->i_afp->if_nextents == 0)) { /* * Build initial attribute list (if required). */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) + if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS) xfs_attr_shortform_create(args); /* @@ -272,7 +272,7 @@ xfs_attr_remove_args( if (!xfs_inode_hasattr(dp)) { error = -ENOATTR; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + } else if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) { ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); error = xfs_attr_shortform_remove(args); } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 0d2d05908537..db4717657ca1 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 863444e2dda7..2f7e89e4be3e 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -309,14 +309,6 @@ xfs_attr3_leaf_verify( return fa; /* - * In recovery there is a transient state where count == 0 is valid - * because we may have transitioned an empty shortform attr to a leaf - * if the attr didn't fit in shortform. - */ - if (!xfs_log_in_recovery(mp) && ichdr.count == 0) - return __this_address; - - /* * firstused is the block offset of the first name info structure. * Make sure it doesn't go off the block or crash into the header. */ @@ -331,6 +323,13 @@ xfs_attr3_leaf_verify( (char *)bp->b_addr + ichdr.firstused) return __this_address; + /* + * NOTE: This verifier historically failed empty leaf buffers because + * we expect the fork to be in another format. Empty attr fork format + * conversions are possible during xattr set, however, and format + * conversion is not atomic with the xattr set that triggers it. We + * cannot assume leaf blocks are non-empty until that is addressed. + */ buf_end = (char *)bp->b_addr + mp->m_attr_geo->blksize; for (i = 0, ent = entries; i < ichdr.count; ent++, i++) { fa = xfs_attr3_leaf_verify_entry(mp, buf_end, leaf, &ichdr, @@ -489,7 +488,7 @@ xfs_attr_copy_value( } if (!args->value) { - args->value = kmem_alloc_large(valuelen, 0); + args->value = kmem_alloc_large(valuelen, KM_NOLOCKDEP); if (!args->value) return -ENOMEM; } @@ -539,7 +538,7 @@ xfs_attr_shortform_bytesfit( /* rounded down */ offset = (XFS_LITINO(mp) - bytes) >> 3; - if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) { + if (dp->i_df.if_format == XFS_DINODE_FMT_DEV) { minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; return (offset >= minforkoff) ? minforkoff : 0; } @@ -567,7 +566,7 @@ xfs_attr_shortform_bytesfit( dsize = dp->i_df.if_bytes; - switch (dp->i_d.di_format) { + switch (dp->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: /* * If there is no attr fork and the data fork is extents, @@ -636,22 +635,19 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) * Create the initial contents of a shortform attribute list. */ void -xfs_attr_shortform_create(xfs_da_args_t *args) +xfs_attr_shortform_create( + struct xfs_da_args *args) { - xfs_attr_sf_hdr_t *hdr; - xfs_inode_t *dp; - struct xfs_ifork *ifp; + struct xfs_inode *dp = args->dp; + struct xfs_ifork *ifp = dp->i_afp; + struct xfs_attr_sf_hdr *hdr; trace_xfs_attr_sf_create(args); - dp = args->dp; - ASSERT(dp != NULL); - ifp = dp->i_afp; - ASSERT(ifp != NULL); ASSERT(ifp->if_bytes == 0); - if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) { + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) { ifp->if_flags &= ~XFS_IFEXTENTS; /* just in case */ - dp->i_d.di_aformat = XFS_DINODE_FMT_LOCAL; + ifp->if_format = XFS_DINODE_FMT_LOCAL; ifp->if_flags |= XFS_IFINLINE; } else { ASSERT(ifp->if_flags & XFS_IFINLINE); @@ -719,13 +715,12 @@ xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - ip->i_d.di_forkoff = 0; - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - - ASSERT(ip->i_d.di_anextents == 0); - ASSERT(ip->i_afp == NULL); + ASSERT(ip->i_afp->if_nextents == 0); + xfs_idestroy_fork(ip->i_afp); + kmem_cache_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + ip->i_d.di_forkoff = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -775,7 +770,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && (mp->m_flags & XFS_MOUNT_ATTR2) && - (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & XFS_DA_OP_ADDNAME)) { xfs_attr_fork_remove(dp, args->trans); } else { @@ -785,7 +780,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || (args->op_flags & XFS_DA_OP_ADDNAME) || !(mp->m_flags & XFS_MOUNT_ATTR2) || - dp->i_d.di_format == XFS_DINODE_FMT_BTREE); + dp->i_df.if_format == XFS_DINODE_FMT_BTREE); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } @@ -962,7 +957,7 @@ xfs_attr_shortform_allfit( + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && - (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); @@ -981,7 +976,7 @@ xfs_attr_shortform_verify( int i; int64_t size; - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL); + ASSERT(ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL); ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = ifp->if_bytes; @@ -1085,7 +1080,7 @@ xfs_attr3_leaf_to_shortform( if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); - ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); + ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); goto out; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 6dd2d937a42a..5be6be309302 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 6fb4572845ce..e1144f22b005 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index aafa4fe70624..bb004fb7944a 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h index 99017b8df292..a04f266ae644 100644 --- a/fs/xfs/libxfs/xfs_bit.h +++ b/fs/xfs/libxfs/xfs_bit.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index fda13cd7add0..667cdd0dfdf4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -61,10 +61,10 @@ xfs_bmap_compute_maxlevels( int sz; /* root block size */ /* - * The maximum number of extents in a file, hence the maximum - * number of leaf entries, is controlled by the type of di_nextents - * (a signed 32-bit number, xfs_extnum_t), or by di_anextents - * (a signed 16-bit number, xfs_aextnum_t). + * The maximum number of extents in a file, hence the maximum number of + * leaf entries, is controlled by the size of the on-disk extent count, + * either a signed 32-bit number for the data fork, or a signed 16-bit + * number for the attr fork. * * Note that we can no longer assume that if we are in ATTR1 that * the fork offset of all the inodes will be @@ -120,10 +120,11 @@ xfs_bmbt_lookup_first( */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + return whichfork != XFS_COW_FORK && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork); + ifp->if_format == XFS_DINODE_FMT_EXTENTS && + ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork); } /* @@ -131,10 +132,11 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + return whichfork != XFS_COW_FORK && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork); + ifp->if_format == XFS_DINODE_FMT_BTREE && + ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork); } /* @@ -213,8 +215,8 @@ xfs_bmap_forkoff_reset( int whichfork) { if (whichfork == XFS_ATTR_FORK && - ip->i_d.di_format != XFS_DINODE_FMT_DEV && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + ip->i_df.if_format != XFS_DINODE_FMT_DEV && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE) { uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; if (dfl_forkoff > ip->i_d.di_forkoff) @@ -315,31 +317,28 @@ xfs_bmap_check_leaf_extents( xfs_inode_t *ip, /* incore inode pointer */ int whichfork) /* data or attr fork */ { + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_block *block; /* current btree block */ xfs_fsblock_t bno; /* block # of "block" */ xfs_buf_t *bp; /* buffer for "block" */ int error; /* error return value */ xfs_extnum_t i=0, j; /* index into the extents list */ - struct xfs_ifork *ifp; /* fork structure */ int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ __be64 *pp; /* pointer to block address */ xfs_bmbt_rec_t *ep; /* pointer to current extent */ xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ xfs_bmbt_rec_t *nextp; /* pointer to next extent */ int bp_release = 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + if (ifp->if_format != XFS_DINODE_FMT_BTREE) return; - } /* skip large extent count inodes */ - if (ip->i_d.di_nextents > 10000) + if (ip->i_df.if_nextents > 10000) return; bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); block = ifp->if_broot; /* * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. @@ -604,7 +603,7 @@ xfs_bmap_btree_to_extents( ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); @@ -632,7 +631,7 @@ xfs_bmap_btree_to_extents( xfs_iroot_realloc(ip, -1, whichfork); ASSERT(ifp->if_broot == NULL); ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -668,7 +667,7 @@ xfs_bmap_extents_to_btree( mp = ip->i_mount; ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); + ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS); /* * Make space in the inode incore. This needs to be undone if we fail @@ -692,7 +691,7 @@ xfs_bmap_extents_to_btree( /* * Convert to a btree with two levels, one record in root. */ - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + ifp->if_format = XFS_DINODE_FMT_BTREE; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = mp; @@ -750,7 +749,7 @@ xfs_bmap_extents_to_btree( xfs_bmbt_disk_set_all(arp, &rec); cnt++; } - ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); + ASSERT(cnt == ifp->if_nextents); xfs_btree_set_numrecs(ablock, cnt); /* @@ -778,7 +777,7 @@ out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); out_root_realloc: xfs_iroot_realloc(ip, -1, whichfork); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -800,16 +799,16 @@ xfs_bmap_local_to_extents_empty( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(whichfork != XFS_COW_FORK); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(ifp->if_bytes == 0); - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + ASSERT(ifp->if_nextents == 0); xfs_bmap_forkoff_reset(ip, whichfork); ifp->if_flags &= ~XFS_IFINLINE; ifp->if_flags |= XFS_IFEXTENTS; ifp->if_u1.if_root = NULL; ifp->if_height = 0; - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ifp->if_format = XFS_DINODE_FMT_EXTENTS; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -840,7 +839,7 @@ xfs_bmap_local_to_extents( */ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK)); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { xfs_bmap_local_to_extents_empty(tp, ip, whichfork); @@ -907,7 +906,7 @@ xfs_bmap_local_to_extents( xfs_iext_first(ifp, &icur); xfs_iext_insert(ip, &icur, &rec, 0); - XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ifp->if_nextents = 1; ip->i_d.di_nblocks = 1; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); @@ -972,7 +971,8 @@ xfs_bmap_add_attrfork_extents( xfs_btree_cur_t *cur; /* bmap btree cursor */ int error; /* error return value */ - if (ip->i_d.di_nextents * sizeof(xfs_bmbt_rec_t) <= XFS_IFORK_DSIZE(ip)) + if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <= + XFS_IFORK_DSIZE(ip)) return 0; cur = NULL; error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, @@ -1033,7 +1033,7 @@ xfs_bmap_set_attrforkoff( int size, int *version) { - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_DEV: ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; break; @@ -1091,17 +1091,6 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; - if (XFS_IS_CORRUPT(mp, ip->i_d.di_anextents != 0)) { - error = -EFSCORRUPTED; - goto trans_cancel; - } - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { - /* - * For inodes coming from pre-6.2 filesystems. - */ - ASSERT(ip->i_d.di_aformat == 0); - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - } xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1110,9 +1099,10 @@ xfs_bmap_add_attrfork( goto trans_cancel; ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, 0); + ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS; ip->i_afp->if_flags = XFS_IFEXTENTS; logflags = 0; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_LOCAL: error = xfs_bmap_add_attrfork_local(tp, ip, &logflags); break; @@ -1183,13 +1173,13 @@ xfs_iread_bmbt_block( xfs_extnum_t num_recs; xfs_extnum_t j; int whichfork = cur->bc_ino.whichfork; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); block = xfs_btree_get_block(cur, level, &bp); /* Abort if we find more records than nextents. */ num_recs = xfs_btree_get_numrecs(block); - if (unlikely(ir->loaded + num_recs > - XFS_IFORK_NEXTENTS(ip, whichfork))) { + if (unlikely(ir->loaded + num_recs > ifp->if_nextents)) { xfs_warn(ip->i_mount, "corrupt dinode %llu, (btree extents).", (unsigned long long)ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block, @@ -1215,7 +1205,7 @@ xfs_iread_bmbt_block( xfs_bmap_fork_to_state(whichfork)); trace_xfs_read_extent(ip, &ir->icur, xfs_bmap_fork_to_state(whichfork), _THIS_IP_); - xfs_iext_next(XFS_IFORK_PTR(ip, whichfork), &ir->icur); + xfs_iext_next(ifp, &ir->icur); } return 0; @@ -1238,9 +1228,7 @@ xfs_iread_extents( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - if (XFS_IS_CORRUPT(mp, - XFS_IFORK_FORMAT(ip, whichfork) != - XFS_DINODE_FMT_BTREE)) { + if (XFS_IS_CORRUPT(mp, ifp->if_format != XFS_DINODE_FMT_BTREE)) { error = -EFSCORRUPTED; goto out; } @@ -1254,8 +1242,7 @@ xfs_iread_extents( if (error) goto out; - if (XFS_IS_CORRUPT(mp, - ir.loaded != XFS_IFORK_NEXTENTS(ip, whichfork))) { + if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) { error = -EFSCORRUPTED; goto out; } @@ -1289,14 +1276,13 @@ xfs_bmap_first_unused( xfs_fileoff_t lowest, max; int error; - ASSERT(xfs_ifork_has_extents(ip, whichfork) || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { *first_unused = 0; return 0; } + ASSERT(xfs_ifork_has_extents(ifp)); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); if (error) @@ -1337,7 +1323,7 @@ xfs_bmap_last_before( struct xfs_iext_cursor icur; int error; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: *last_block = 0; return 0; @@ -1436,16 +1422,17 @@ xfs_bmap_last_offset( xfs_fileoff_t *last_block, int whichfork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec rec; int is_empty; int error; *last_block = 0; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) return 0; - if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ip, whichfork))) + if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); @@ -1463,23 +1450,22 @@ xfs_bmap_last_offset( */ int /* 1=>1 block, 0=>otherwise */ xfs_bmap_one_block( - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ + struct xfs_inode *ip, /* incore inode */ + int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp; /* inode fork pointer */ - int rval; /* return value */ - xfs_bmbt_irec_t s; /* internal version of extent */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int rval; /* return value */ + struct xfs_bmbt_irec s; /* internal version of extent */ struct xfs_iext_cursor icur; #ifndef DEBUG if (whichfork == XFS_DATA_FORK) return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; #endif /* !DEBUG */ - if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + if (ifp->if_nextents != 1) return 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) return 0; - ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_flags & XFS_IFEXTENTS); xfs_iext_first(ifp, &icur); xfs_iext_get_extent(ifp, &icur, &s); @@ -1501,10 +1487,11 @@ xfs_bmap_add_extent_delay_real( struct xfs_bmalloca *bma, int whichfork) { + struct xfs_mount *mp = bma->ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ - struct xfs_ifork *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ xfs_bmbt_irec_t r[3]; /* neighbor extent entries */ /* left is 0, right is 1, prev is 2 */ @@ -1514,16 +1501,9 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t da_old; /* old count del alloc blocks used */ xfs_filblks_t temp=0; /* value for da_new calculations */ int tmp_rval; /* partial logging flags */ - struct xfs_mount *mp; - xfs_extnum_t *nextents; struct xfs_bmbt_irec old; - mp = bma->ip->i_mount; - ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(whichfork != XFS_ATTR_FORK); - nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents : - &bma->ip->i_d.di_nextents); - ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); @@ -1614,7 +1594,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_remove(bma->ip, &bma->icur, state); xfs_iext_prev(ifp, &bma->icur); xfs_iext_update_extent(bma->ip, state, &bma->icur, &LEFT); - (*nextents)--; + ifp->if_nextents--; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1718,8 +1698,8 @@ xfs_bmap_add_extent_delay_real( PREV.br_startblock = new->br_startblock; PREV.br_state = new->br_state; xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); + ifp->if_nextents++; - (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1784,7 +1764,8 @@ xfs_bmap_add_extent_delay_real( * The left neighbor is not contiguous. */ xfs_iext_update_extent(bma->ip, state, &bma->icur, new); - (*nextents)++; + ifp->if_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1870,7 +1851,8 @@ xfs_bmap_add_extent_delay_real( * The right neighbor is not contiguous. */ xfs_iext_update_extent(bma->ip, state, &bma->icur, new); - (*nextents)++; + ifp->if_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1955,7 +1937,7 @@ xfs_bmap_add_extent_delay_real( xfs_iext_next(ifp, &bma->icur); xfs_iext_insert(bma->ip, &bma->icur, &RIGHT, state); xfs_iext_insert(bma->ip, &bma->icur, &LEFT, state); - (*nextents)++; + ifp->if_nextents++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -2159,8 +2141,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &LEFT); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 2); + ifp->if_nextents -= 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2212,8 +2193,7 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &LEFT); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2255,9 +2235,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &PREV); + ifp->if_nextents--; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2364,8 +2343,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_insert(ip, icur, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; + if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2440,9 +2419,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_update_extent(ip, state, icur, &PREV); xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, new, state); + ifp->if_nextents++; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2493,9 +2471,8 @@ xfs_bmap_add_extent_unwritten_real( xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &r[1], state); xfs_iext_insert(ip, icur, &r[0], state); + ifp->if_nextents += 2; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 2); if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2810,9 +2787,8 @@ xfs_bmap_add_extent_hole_real( xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, state, icur, &left); + ifp->if_nextents--; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { @@ -2910,8 +2886,8 @@ xfs_bmap_add_extent_hole_real( * Insert a new entry. */ xfs_iext_insert(ip, icur, new, state); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; + if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { @@ -3891,7 +3867,8 @@ xfs_bmapi_read( int flags) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; + int whichfork = xfs_bmapi_whichfork(flags); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec got; xfs_fileoff_t obno; xfs_fileoff_t end; @@ -3899,48 +3876,23 @@ xfs_bmapi_read( int error; bool eof = false; int n = 0; - int whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); - ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| - XFS_BMAPI_COWFORK))); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE))); ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + if (WARN_ON_ONCE(!ifp)) + return -EFSCORRUPTED; + + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) return -EFSCORRUPTED; - } if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; XFS_STATS_INC(mp, xs_blk_mapr); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!ifp) { - /* No CoW fork? Return a hole. */ - if (whichfork == XFS_COW_FORK) { - mval->br_startoff = bno; - mval->br_startblock = HOLESTARTBLOCK; - mval->br_blockcount = len; - mval->br_state = XFS_EXT_NORM; - *nmap = 1; - return 0; - } - - /* - * A missing attr ifork implies that the inode says we're in - * extents or btree format but failed to pass the inode fork - * verifier while trying to load it. Treat that as a file - * corruption too. - */ -#ifdef DEBUG - xfs_alert(mp, "%s: inode %llu missing fork %d", - __func__, ip->i_ino, whichfork); -#endif /* DEBUG */ - return -EFSCORRUPTED; - } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); if (error) @@ -4193,17 +4145,7 @@ xfs_bmapi_allocate( bma->got.br_blockcount = bma->length; bma->got.br_state = XFS_EXT_NORM; - /* - * In the data fork, a wasdelay extent has been initialized, so - * shouldn't be flagged as unwritten. - * - * For the cow fork, however, we convert delalloc reservations - * (extents allocated for speculative preallocation) to - * allocated unwritten extents, and only convert the unwritten - * extents to real extents when we're about to write the data. - */ - if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && - (bma->flags & XFS_BMAPI_PREALLOC)) + if (bma->flags & XFS_BMAPI_PREALLOC) bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) @@ -4317,11 +4259,13 @@ xfs_bmapi_minleft( struct xfs_inode *ip, int fork) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, fork); + if (tp && tp->t_firstblock != NULLFSBLOCK) return 0; - if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE) + if (ifp->if_format != XFS_DINODE_FMT_BTREE) return 1; - return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1; + return be16_to_cpu(ifp->if_broot->bb_level) + 1; } /* @@ -4336,11 +4280,13 @@ xfs_bmapi_finish( int whichfork, int error) { + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + if ((bma->logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + ifp->if_format != XFS_DINODE_FMT_EXTENTS) bma->logflags &= ~xfs_ilog_fext(whichfork); else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE) + ifp->if_format != XFS_DINODE_FMT_BTREE) bma->logflags &= ~xfs_ilog_fbroot(whichfork); if (bma->logflags) @@ -4372,13 +4318,13 @@ xfs_bmapi_write( .total = total, }; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; + int whichfork = xfs_bmapi_whichfork(flags); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t end; /* end of mapped file region */ bool eof = false; /* after the end of extents */ int error; /* error return */ int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ - int whichfork; /* data or attr fork */ #ifdef DEBUG xfs_fileoff_t orig_bno; /* original block number value */ @@ -4393,13 +4339,12 @@ xfs_bmapi_write( orig_mval = mval; orig_nmap = *nmap; #endif - whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); ASSERT(tp != NULL); ASSERT(len > 0); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(!(flags & XFS_BMAPI_REMAP)); @@ -4415,7 +4360,7 @@ xfs_bmapi_write( ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) != (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -4423,8 +4368,6 @@ xfs_bmapi_write( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - ifp = XFS_IFORK_PTR(ip, whichfork); - XFS_STATS_INC(mp, xs_blk_mapw); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -4534,9 +4477,8 @@ xfs_bmapi_write( if (error) goto error0; - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || - XFS_IFORK_NEXTENTS(ip, whichfork) > - XFS_IFORK_MAXEXT(ip, whichfork)); + ASSERT(ifp->if_format != XFS_DINODE_FMT_BTREE || + ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork)); xfs_bmapi_finish(&bma, whichfork, 0); xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, orig_nmap, *nmap); @@ -4611,8 +4553,23 @@ xfs_bmapi_convert_delalloc( bma.offset = bma.got.br_startoff; bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + + /* + * When we're converting the delalloc reservations backing dirty pages + * in the page cache, we must be careful about how we create the new + * extents: + * + * New CoW fork extents are created unwritten, turned into real extents + * when we're about to write the data to disk, and mapped into the data + * fork after the write finishes. End of story. + * + * New data fork extents must be mapped in as unwritten and converted + * to real extents after the write succeeds to avoid exposing stale + * disk contents if we crash. + */ + bma.flags = XFS_BMAPI_PREALLOC; if (whichfork == XFS_COW_FORK) - bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + bma.flags |= XFS_BMAPI_COWFORK; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) bma.prev.br_startoff = NULLFILEOFF; @@ -4682,7 +4639,7 @@ xfs_bmapi_remap( ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -4726,9 +4683,9 @@ xfs_bmapi_remap( error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); error0: - if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) + if (ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS) logflags &= ~XFS_ILOG_DEXT; - else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + else if (ip->i_df.if_format != XFS_DINODE_FMT_BTREE) logflags &= ~XFS_ILOG_DBROOT; if (logflags) @@ -5078,9 +5035,8 @@ xfs_bmap_del_extent_real( * conversion to btree format, since the transaction will be dirty then. */ if (tp->t_blk_res == 0 && - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= - XFS_IFORK_MAXEXT(ip, whichfork) && + ifp->if_format == XFS_DINODE_FMT_EXTENTS && + ifp->if_nextents >= XFS_IFORK_MAXEXT(ip, whichfork) && del->br_startoff > got.br_startoff && del_endoff < got_endoff) return -ENOSPC; @@ -5132,8 +5088,8 @@ xfs_bmap_del_extent_real( */ xfs_iext_remove(ip, icur, state); xfs_iext_prev(ifp, icur); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; + flags |= XFS_ILOG_CORE; if (!cur) { flags |= xfs_ilog_fext(whichfork); @@ -5241,8 +5197,8 @@ xfs_bmap_del_extent_real( } } else flags |= xfs_ilog_fext(whichfork); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + + ifp->if_nextents++; xfs_iext_next(ifp, icur); xfs_iext_insert(ip, icur, &new, state); break; @@ -5322,7 +5278,7 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork))) + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -5360,7 +5316,7 @@ __xfs_bunmapi( logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_ino.flags = 0; } else @@ -5605,10 +5561,10 @@ error0: * logging the extent records if we've converted to btree format. */ if ((logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + ifp->if_format != XFS_DINODE_FMT_EXTENTS) logflags &= ~xfs_ilog_fext(whichfork); else if ((logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + ifp->if_format != XFS_DINODE_FMT_BTREE) logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log inode even in the error case, if the transaction @@ -5690,6 +5646,7 @@ xfs_bmse_merge( struct xfs_btree_cur *cur, int *logflags) /* output */ { + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; @@ -5708,8 +5665,7 @@ xfs_bmse_merge( * Update the on-disk extent count, the btree if necessary and log the * inode. */ - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + ifp->if_nextents--; *logflags |= XFS_ILOG_CORE; if (!cur) { *logflags |= XFS_ILOG_DEXT; @@ -5747,7 +5703,7 @@ xfs_bmse_merge( done: xfs_iext_remove(ip, icur, 0); - xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); + xfs_iext_prev(ifp, icur); xfs_iext_update_extent(ip, xfs_bmap_fork_to_state(whichfork), icur, &new); @@ -5819,7 +5775,7 @@ xfs_bmap_collapse_extents( int error = 0; int logflags = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -5936,7 +5892,7 @@ xfs_bmap_insert_extents( int error = 0; int logflags = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -6030,18 +5986,18 @@ xfs_bmap_split_extent( xfs_fileoff_t split_fsb) { int whichfork = XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp; xfs_fsblock_t gotblkcnt; /* new block count for got */ struct xfs_iext_cursor icur; int error = 0; int logflags = 0; int i = 0; - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, whichfork)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { return -EFSCORRUPTED; } @@ -6049,7 +6005,6 @@ xfs_bmap_split_extent( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { /* Read in all the extents */ error = xfs_iread_extents(tp, ip, whichfork); @@ -6097,8 +6052,7 @@ xfs_bmap_split_extent( /* Add new extent */ xfs_iext_next(ifp, &icur); xfs_iext_insert(ip, &icur, &new, 0); - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + ifp->if_nextents++; if (cur) { error = xfs_bmbt_lookup_eq(cur, &new, &i); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f3259ad5c22c..6028a3c825ba 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 295a59cf8840..d9c63f17d2de 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -636,10 +636,7 @@ xfs_bmbt_change_owner( ASSERT(tp || buffer_list); ASSERT(!(tp && buffer_list)); - if (whichfork == XFS_DATA_FORK) - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE); - else - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE); + ASSERT(XFS_IFORK_PTR(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 29b407d053b4..72bf74c79fb9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 8626c5a81aad..10e50cbacacf 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 53e503b6f186..6e25de6621e4 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 08c0a4d98b89..059ac108b1b3 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * Copyright (c) 2013 Red Hat, Inc. diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 22557527cfdb..d8f586256add 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -178,6 +178,18 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, }; +static void +xfs_defer_create_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp, + bool sort) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + + dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, + dfp->dfp_count, sort); +} + /* * For each pending item in the intake list, log its intent item and the * associated extents, then add the entire intake list to the end of @@ -187,17 +199,11 @@ STATIC void xfs_defer_create_intents( struct xfs_trans *tp) { - struct list_head *li; struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count); trace_xfs_defer_create_intent(tp->t_mountp, dfp); - list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items); - list_for_each(li, &dfp->dfp_work) - ops->log_item(tp, dfp->dfp_intent, li); + xfs_defer_create_intent(tp, dfp, true); } } @@ -234,10 +240,13 @@ xfs_defer_trans_roll( struct xfs_log_item *lip; struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS]; struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES]; + unsigned int ordered = 0; /* bitmap */ int bpcount = 0, ipcount = 0; int i; int error; + BUILD_BUG_ON(NBBY * sizeof(ordered) < XFS_DEFER_OPS_NR_BUFS); + list_for_each_entry(lip, &tp->t_items, li_trans) { switch (lip->li_type) { case XFS_LI_BUF: @@ -248,7 +257,10 @@ xfs_defer_trans_roll( ASSERT(0); return -EFSCORRUPTED; } - xfs_trans_dirty_buf(tp, bli->bli_buf); + if (bli->bli_flags & XFS_BLI_ORDERED) + ordered |= (1U << bpcount); + else + xfs_trans_dirty_buf(tp, bli->bli_buf); bplist[bpcount++] = bli->bli_buf; } break; @@ -289,6 +301,8 @@ xfs_defer_trans_roll( /* Rejoin the buffers and dirty them so the log moves forward. */ for (i = 0; i < bpcount; i++) { xfs_trans_bjoin(tp, bplist[i]); + if (ordered & (1U << i)) + xfs_trans_ordered_buf(tp, bplist[i]); xfs_trans_bhold(tp, bplist[i]); } @@ -346,6 +360,53 @@ xfs_defer_cancel_list( } /* + * Log an intent-done item for the first pending intent, and finish the work + * items. + */ +static int +xfs_defer_finish_one( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_btree_cur *state = NULL; + struct list_head *li, *n; + int error; + + trace_xfs_defer_pending_finish(tp->t_mountp, dfp); + + dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + list_for_each_safe(li, n, &dfp->dfp_work) { + list_del(li); + dfp->dfp_count--; + error = ops->finish_item(tp, dfp->dfp_done, li, &state); + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction; put the work item + * back on the list and log a new log intent item to + * replace the old one. See "Requesting a Fresh + * Transaction while Finishing Deferred Work" above. + */ + list_add(li, &dfp->dfp_work); + dfp->dfp_count++; + dfp->dfp_done = NULL; + xfs_defer_create_intent(tp, dfp, false); + } + + if (error) + goto out; + } + + /* Done with the dfp, free it. */ + list_del(&dfp->dfp_list); + kmem_free(dfp); +out: + if (ops->finish_cleanup) + ops->finish_cleanup(tp, state, error); + return error; +} + +/* * Finish all the pending work. This involves logging intent items for * any work items that wandered in since the last transaction roll (if * one has even happened), rolling the transaction, and finishing the @@ -358,11 +419,7 @@ xfs_defer_finish_noroll( struct xfs_trans **tp) { struct xfs_defer_pending *dfp; - struct list_head *li; - struct list_head *n; - void *state; int error = 0; - const struct xfs_defer_op_type *ops; LIST_HEAD(dop_pending); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -371,87 +428,30 @@ xfs_defer_finish_noroll( /* Until we run out of pending work to finish... */ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { - /* log intents and pull in intake items */ xfs_defer_create_intents(*tp); list_splice_tail_init(&(*tp)->t_dfops, &dop_pending); - /* - * Roll the transaction. - */ error = xfs_defer_trans_roll(tp); if (error) - goto out; + goto out_shutdown; - /* Log an intent-done item for the first pending item. */ dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); - dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent, - dfp->dfp_count); - - /* Finish the work items. */ - state = NULL; - list_for_each_safe(li, n, &dfp->dfp_work) { - list_del(li); - dfp->dfp_count--; - error = ops->finish_item(*tp, li, dfp->dfp_done, - &state); - if (error == -EAGAIN) { - /* - * Caller wants a fresh transaction; - * put the work item back on the list - * and jump out. - */ - list_add(li, &dfp->dfp_work); - dfp->dfp_count++; - break; - } else if (error) { - /* - * Clean up after ourselves and jump out. - * xfs_defer_cancel will take care of freeing - * all these lists and stuff. - */ - if (ops->finish_cleanup) - ops->finish_cleanup(*tp, state, error); - goto out; - } - } - if (error == -EAGAIN) { - /* - * Caller wants a fresh transaction, so log a - * new log intent item to replace the old one - * and roll the transaction. See "Requesting - * a Fresh Transaction while Finishing - * Deferred Work" above. - */ - dfp->dfp_intent = ops->create_intent(*tp, - dfp->dfp_count); - dfp->dfp_done = NULL; - list_for_each(li, &dfp->dfp_work) - ops->log_item(*tp, dfp->dfp_intent, li); - } else { - /* Done with the dfp, free it. */ - list_del(&dfp->dfp_list); - kmem_free(dfp); - } - - if (ops->finish_cleanup) - ops->finish_cleanup(*tp, state, error); - } - -out: - if (error) { - xfs_defer_trans_abort(*tp, &dop_pending); - xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); - trace_xfs_defer_finish_error(*tp, error); - xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); - xfs_defer_cancel(*tp); - return error; + error = xfs_defer_finish_one(*tp, dfp); + if (error && error != -EAGAIN) + goto out_shutdown; } trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; + +out_shutdown: + xfs_defer_trans_abort(*tp, &dop_pending); + xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); + trace_xfs_defer_finish_error(*tp, error); + xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); + xfs_defer_cancel(*tp); + return error; } int diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 7c28d7608ac6..6b2ca580f2b0 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2016 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> @@ -6,6 +6,7 @@ #ifndef __XFS_DEFER_H__ #define __XFS_DEFER_H__ +struct xfs_btree_cur; struct xfs_defer_op_type; /* @@ -28,8 +29,8 @@ enum xfs_defer_ops_type { struct xfs_defer_pending { struct list_head dfp_list; /* pending items */ struct list_head dfp_work; /* work items */ - void *dfp_intent; /* log intent item */ - void *dfp_done; /* log done item */ + struct xfs_log_item *dfp_intent; /* log intent item */ + struct xfs_log_item *dfp_done; /* log done item */ unsigned int dfp_count; /* # extent items */ enum xfs_defer_ops_type dfp_type; }; @@ -43,15 +44,16 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. */ struct xfs_defer_op_type { - void (*abort_intent)(void *); - void *(*create_done)(struct xfs_trans *, void *, unsigned int); - int (*finish_item)(struct xfs_trans *, struct list_head *, void *, - void **); - void (*finish_cleanup)(struct xfs_trans *, void *, int); - void (*cancel_item)(struct list_head *); - int (*diff_items)(void *, struct list_head *, struct list_head *); - void *(*create_intent)(struct xfs_trans *, uint); - void (*log_item)(struct xfs_trans *, void *, struct list_head *); + struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, + struct list_head *items, unsigned int count, bool sort); + void (*abort_intent)(struct xfs_log_item *intent); + struct xfs_log_item *(*create_done)(struct xfs_trans *tp, + struct xfs_log_item *intent, unsigned int count); + int (*finish_item)(struct xfs_trans *tp, struct xfs_log_item *done, + struct list_head *item, struct xfs_btree_cur **state); + void (*finish_cleanup)(struct xfs_trans *tp, + struct xfs_btree_cur *state, int error); + void (*cancel_item)(struct list_head *item); unsigned int max_items; }; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index dd6fcaaea318..612a9c5e41b1 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -278,7 +278,7 @@ xfs_dir_createname( if (!inum) args->op_flags |= XFS_DA_OP_JUSTCHECK; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_addname(args); goto out_free; } @@ -373,7 +373,7 @@ xfs_dir_lookup( args->op_flags |= XFS_DA_OP_CILOOKUP; lock_mode = xfs_ilock_data_map_shared(dp); - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_lookup(args); goto out_check_rval; } @@ -443,7 +443,7 @@ xfs_dir_removename( args->whichfork = XFS_DATA_FORK; args->trans = tp; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_removename(args); goto out_free; } @@ -504,7 +504,7 @@ xfs_dir_replace( args->whichfork = XFS_DATA_FORK; args->trans = tp; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_replace(args); goto out_free; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 033777e282f2..e55378640b05 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 1dbf2f980a26..5b59d3f7746b 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1104,7 +1104,7 @@ xfs_dir2_sf_to_block( ASSERT(ifp->if_bytes == dp->i_d.di_size); ASSERT(ifp->if_u1.if_data != NULL); ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); - ASSERT(dp->i_d.di_nextents == 0); + ASSERT(dp->i_df.if_nextents == 0); /* * Copy the directory into a temporary buffer. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 01ee0b926572..44c6a77cba05 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 7b7f6fb2ea3b..2463b5d73447 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -343,7 +343,7 @@ xfs_dir2_block_to_sf( */ ASSERT(dp->i_df.if_bytes == 0); xfs_init_local_fork(dp, XFS_DATA_FORK, sfp, size); - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + dp->i_df.if_format = XFS_DINODE_FMT_LOCAL; dp->i_d.di_size = size; logflags |= XFS_ILOG_DDATA; @@ -710,11 +710,11 @@ xfs_dir2_sf_verify( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; - struct xfs_ifork *ifp; xfs_ino_t ino; int i; int i8count; @@ -723,9 +723,8 @@ xfs_dir2_sf_verify( int error; uint8_t filetype; - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; size = ifp->if_bytes; @@ -827,9 +826,9 @@ xfs_dir2_sf_create( * If it's currently a zero-length extent file, * convert it to local format. */ - if (dp->i_d.di_format == XFS_DINODE_FMT_EXTENTS) { + if (dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS) { dp->i_df.if_flags &= ~XFS_IFEXTENTS; /* just in case */ - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + dp->i_df.if_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); dp->i_df.if_flags |= XFS_IFINLINE; } @@ -1027,7 +1026,7 @@ xfs_dir2_sf_replace_needblock( int newsize; struct xfs_dir2_sf_hdr *sfp; - if (dp->i_d.di_format != XFS_DINODE_FMT_LOCAL) + if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL) return false; sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 79e6c4fb1d8a..53b305dea381 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * Copyright (C) 2017 Oracle. @@ -55,7 +55,8 @@ #define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 #define XFS_ERRTAG_IUNLINK_FALLBACK 34 -#define XFS_ERRTAG_MAX 35 +#define XFS_ERRTAG_BUF_IOERROR 35 +#define XFS_ERRTAG_MAX 36 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -95,5 +96,6 @@ #define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 #define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) +#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 045556e78ee2..b42a52bfa1e9 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. @@ -964,13 +964,12 @@ enum xfs_dinode_fmt { /* * Inode data & attribute fork sizes, per inode. */ -#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) #define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp)) + ((dip)->di_forkoff ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp)) #define XFS_DFORK_ASIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0) + ((dip)->di_forkoff ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? \ XFS_DFORK_DSIZE(dip, mp) : \ @@ -1681,7 +1680,7 @@ struct xfs_acl_entry { struct xfs_acl { __be32 acl_cnt; - struct xfs_acl_entry acl_entry[0]; + struct xfs_acl_entry acl_entry[]; }; /* diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 245188e4f6d3..84bcffa87753 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: LGPL-2.1 +/* SPDX-License-Identifier: LGPL-2.1 */ /* * Copyright (c) 1995-2005 Silicon Graphics, Inc. * All Rights Reserved. diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 272005ac8c88..99e796256c5d 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Copyright (C) 2019 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 39c5a6e24915..6f84ea85fdd8 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -161,8 +161,7 @@ xfs_imap_to_bp( struct xfs_imap *imap, struct xfs_dinode **dipp, struct xfs_buf **bpp, - uint buf_flags, - uint iget_flags) + uint buf_flags) { struct xfs_buf *bp; int error; @@ -172,12 +171,7 @@ xfs_imap_to_bp( (int)imap->im_len, buf_flags, &bp, &xfs_inode_buf_ops); if (error) { - if (error == -EAGAIN) { - ASSERT(buf_flags & XBF_TRYLOCK); - return error; - } - xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", - __func__, error); + ASSERT(error != -EAGAIN || (buf_flags & XBF_TRYLOCK)); return error; } @@ -186,13 +180,36 @@ xfs_imap_to_bp( return 0; } -void +int xfs_inode_from_disk( struct xfs_inode *ip, struct xfs_dinode *from) { struct xfs_icdinode *to = &ip->i_d; struct inode *inode = VFS_I(ip); + int error; + xfs_failaddr_t fa; + + ASSERT(ip->i_cowfp == NULL); + ASSERT(ip->i_afp == NULL); + + fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", from, + sizeof(*from), fa); + return -EFSCORRUPTED; + } + + /* + * First get the permanent information that is needed to allocate an + * inode. If the inode is unused, mode is zero and we shouldn't mess + * with the unitialized part of it. + */ + to->di_flushiter = be16_to_cpu(from->di_flushiter); + inode->i_generation = be32_to_cpu(from->di_gen); + inode->i_mode = be16_to_cpu(from->di_mode); + if (!inode->i_mode) + return 0; /* * Convert v1 inodes immediately to v2 inode format as this is the @@ -208,10 +225,8 @@ xfs_inode_from_disk( be16_to_cpu(from->di_projid_lo); } - to->di_format = from->di_format; i_uid_write(inode, be32_to_cpu(from->di_uid)); i_gid_write(inode, be32_to_cpu(from->di_gid)); - to->di_flushiter = be16_to_cpu(from->di_flushiter); /* * Time is signed, so need to convert to signed 32 bit before @@ -225,16 +240,11 @@ xfs_inode_from_disk( inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec); inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec); inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec); - inode->i_generation = be32_to_cpu(from->di_gen); - inode->i_mode = be16_to_cpu(from->di_mode); to->di_size = be64_to_cpu(from->di_size); to->di_nblocks = be64_to_cpu(from->di_nblocks); to->di_extsize = be32_to_cpu(from->di_extsize); - to->di_nextents = be32_to_cpu(from->di_nextents); - to->di_anextents = be16_to_cpu(from->di_anextents); to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; to->di_dmevmask = be32_to_cpu(from->di_dmevmask); to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); @@ -247,6 +257,22 @@ xfs_inode_from_disk( to->di_flags2 = be64_to_cpu(from->di_flags2); to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } + + error = xfs_iformat_data_fork(ip, from); + if (error) + return error; + if (from->di_forkoff) { + error = xfs_iformat_attr_fork(ip, from); + if (error) + goto out_destroy_data_fork; + } + if (xfs_is_reflink_inode(ip)) + xfs_ifork_init_cow(ip); + return 0; + +out_destroy_data_fork: + xfs_idestroy_fork(&ip->i_df); + return error; } void @@ -261,7 +287,7 @@ xfs_inode_to_disk( to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); to->di_onlink = 0; - to->di_format = from->di_format; + to->di_format = xfs_ifork_format(&ip->i_df); to->di_uid = cpu_to_be32(i_uid_read(inode)); to->di_gid = cpu_to_be32(i_gid_read(inode)); to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff); @@ -281,10 +307,10 @@ xfs_inode_to_disk( to->di_size = cpu_to_be64(from->di_size); to->di_nblocks = cpu_to_be64(from->di_nblocks); to->di_extsize = cpu_to_be32(from->di_extsize); - to->di_nextents = cpu_to_be32(from->di_nextents); - to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; + to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_dmevmask = cpu_to_be32(from->di_dmevmask); to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); @@ -405,7 +431,7 @@ xfs_dinode_verify_forkoff( struct xfs_dinode *dip, struct xfs_mount *mp) { - if (!XFS_DFORK_Q(dip)) + if (!dip->di_forkoff) return NULL; switch (dip->di_format) { @@ -508,7 +534,7 @@ xfs_dinode_verify( return __this_address; } - if (XFS_DFORK_Q(dip)) { + if (dip->di_forkoff) { fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK); if (fa) return fa; @@ -585,122 +611,6 @@ xfs_dinode_calc_crc( } /* - * Read the disk inode attributes into the in-core inode structure. - * - * For version 5 superblocks, if we are initialising a new inode and we are not - * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new - * inode core with a random generation number. If we are keeping inodes around, - * we need to read the inode cluster to get the existing generation number off - * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode - * format) then log recovery is dependent on the di_flushiter field being - * initialised from the current on-disk value and hence we must also read the - * inode off disk. - */ -int -xfs_iread( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_inode_t *ip, - uint iget_flags) -{ - xfs_buf_t *bp; - xfs_dinode_t *dip; - xfs_failaddr_t fa; - int error; - - /* - * Fill in the location information in the in-core inode. - */ - error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); - if (error) - return error; - - /* shortcut IO on inode allocation if possible */ - if ((iget_flags & XFS_IGET_CREATE) && - xfs_sb_version_has_v3inode(&mp->m_sb) && - !(mp->m_flags & XFS_MOUNT_IKEEP)) { - VFS_I(ip)->i_generation = prandom_u32(); - return 0; - } - - /* - * Get pointers to the on-disk inode and the buffer containing it. - */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); - if (error) - return error; - - /* even unallocated inodes are verified */ - fa = xfs_dinode_verify(mp, ip->i_ino, dip); - if (fa) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "dinode", dip, - sizeof(*dip), fa); - error = -EFSCORRUPTED; - goto out_brelse; - } - - /* - * If the on-disk inode is already linked to a directory - * entry, copy all of the inode into the in-core inode. - * xfs_iformat_fork() handles copying in the inode format - * specific information. - * Otherwise, just get the truly permanent information. - */ - if (dip->di_mode) { - xfs_inode_from_disk(ip, dip); - error = xfs_iformat_fork(ip, dip); - if (error) { -#ifdef DEBUG - xfs_alert(mp, "%s: xfs_iformat() returned error %d", - __func__, error); -#endif /* DEBUG */ - goto out_brelse; - } - } else { - /* - * Partial initialisation of the in-core inode. Just the bits - * that xfs_ialloc won't overwrite or relies on being correct. - */ - VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); - ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); - - /* - * Make sure to pull in the mode here as well in - * case the inode is released without being used. - * This ensures that xfs_inactive() will see that - * the inode is already free and not try to mess - * with the uninitialized part of it. - */ - VFS_I(ip)->i_mode = 0; - } - - ip->i_delayed_blks = 0; - - /* - * Mark the buffer containing the inode as something to keep - * around for a while. This helps to keep recently accessed - * meta-data in-core longer. - */ - xfs_buf_set_ref(bp, XFS_INO_REF); - - /* - * Use xfs_trans_brelse() to release the buffer containing the on-disk - * inode, because it was acquired with xfs_trans_read_buf() in - * xfs_imap_to_bp() above. If tp is NULL, this is just a normal - * brelse(). If we're within a transaction, then xfs_trans_brelse() - * will only release the buffer if it is not dirty within the - * transaction. It will be OK to release the buffer in this case, - * because inodes on disk are never destroyed and we will be locking the - * new in-core inode before putting it in the cache where other - * processes can find it. Thus we don't have to worry about the inode - * being changed just because we released the buffer. - */ - out_brelse: - xfs_trans_brelse(tp, bp); - return error; -} - -/* * Validate di_extsize hint. * * The rules are documented at xfs_ioctl_setattr_check_extsize(). diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 9b373dcf9e34..865ac493c72a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -16,16 +16,12 @@ struct xfs_dinode; * format specific structures at the appropriate time. */ struct xfs_icdinode { - int8_t di_format; /* format of di_c data */ uint16_t di_flushiter; /* incremented on flush */ uint32_t di_projid; /* owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - int8_t di_aformat; /* format of attr fork's data */ uint32_t di_dmevmask; /* DMIG event mask */ uint16_t di_dmstate; /* DMIG state info */ uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ @@ -48,13 +44,11 @@ struct xfs_imap { int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, struct xfs_imap *, struct xfs_dinode **, - struct xfs_buf **, uint, uint); -int xfs_iread(struct xfs_mount *, struct xfs_trans *, - struct xfs_inode *, uint); + struct xfs_buf **, uint); void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn); -void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); +int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 518c6f0ec3a6..28b366275ae0 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -26,110 +26,6 @@ kmem_zone_t *xfs_ifork_zone; -STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); -STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); -STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); - -/* - * Copy inode type and data and attr format specific information from the - * on-disk inode to the in-core inode and fork structures. For fifos, devices, - * and sockets this means set i_rdev to the proper value. For files, - * directories, and symlinks this means to bring in the in-line data or extent - * pointers as well as the attribute fork. For a fork in B-tree format, only - * the root is immediately brought in-core. The rest will be read in later when - * first referenced (see xfs_iread_extents()). - */ -int -xfs_iformat_fork( - struct xfs_inode *ip, - struct xfs_dinode *dip) -{ - struct inode *inode = VFS_I(ip); - struct xfs_attr_shortform *atp; - int size; - int error = 0; - xfs_fsize_t di_size; - - switch (inode->i_mode & S_IFMT) { - case S_IFIFO: - case S_IFCHR: - case S_IFBLK: - case S_IFSOCK: - ip->i_d.di_size = 0; - inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); - break; - - case S_IFREG: - case S_IFLNK: - case S_IFDIR: - switch (dip->di_format) { - case XFS_DINODE_FMT_LOCAL: - di_size = be64_to_cpu(dip->di_size); - size = (int)di_size; - error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); - break; - default: - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, - dip, sizeof(*dip), __this_address); - return -EFSCORRUPTED; - } - break; - - default: - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, - sizeof(*dip), __this_address); - return -EFSCORRUPTED; - } - if (error) - return error; - - if (xfs_is_reflink_inode(ip)) { - ASSERT(ip->i_cowfp == NULL); - xfs_ifork_init_cow(ip); - } - - if (!XFS_DFORK_Q(dip)) - return 0; - - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); - - switch (dip->di_aformat) { - case XFS_DINODE_FMT_LOCAL: - atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); - size = be16_to_cpu(atp->hdr.totsize); - - error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); - break; - default: - xfs_inode_verifier_error(ip, error, __func__, dip, - sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - break; - } - if (error) { - kmem_cache_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - if (ip->i_cowfp) - kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); - ip->i_cowfp = NULL; - xfs_idestroy_fork(ip, XFS_DATA_FORK); - } - return error; -} - void xfs_init_local_fork( struct xfs_inode *ip, @@ -292,12 +188,11 @@ xfs_iformat_btree( * or the number of extents is greater than the number of * blocks. */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= - XFS_IFORK_MAXEXT(ip, whichfork) || + if (unlikely(ifp->if_nextents <= XFS_IFORK_MAXEXT(ip, whichfork) || nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, mp, whichfork) || - XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) || + ifp->if_nextents > ip->i_d.di_nblocks) || level == 0 || level > XFS_BTREE_MAXLEVELS) { xfs_warn(mp, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); @@ -325,6 +220,110 @@ xfs_iformat_btree( return 0; } +int +xfs_iformat_data_fork( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + struct inode *inode = VFS_I(ip); + int error; + + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. + */ + ip->i_df.if_format = dip->di_format; + ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents); + + switch (inode->i_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + ip->i_d.di_size = 0; + inode->i_rdev = xfs_to_linux_dev_t(xfs_dinode_get_rdev(dip)); + return 0; + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (ip->i_df.if_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, + be64_to_cpu(dip->di_size)); + if (!error) + error = xfs_ifork_verify_local_data(ip); + return error; + case XFS_DINODE_FMT_EXTENTS: + return xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + case XFS_DINODE_FMT_BTREE: + return xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, + dip, sizeof(*dip), __this_address); + return -EFSCORRUPTED; + } + break; + default: + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); + return -EFSCORRUPTED; + } +} + +static uint16_t +xfs_dfork_attr_shortform_size( + struct xfs_dinode *dip) +{ + struct xfs_attr_shortform *atp = + (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip); + + return be16_to_cpu(atp->hdr.totsize); +} + +int +xfs_iformat_attr_fork( + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + int error = 0; + + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. + */ + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); + ip->i_afp->if_format = dip->di_aformat; + if (unlikely(ip->i_afp->if_format == 0)) /* pre IRIX 6.2 file system */ + ip->i_afp->if_format = XFS_DINODE_FMT_EXTENTS; + ip->i_afp->if_nextents = be16_to_cpu(dip->di_anextents); + + switch (ip->i_afp->if_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, + xfs_dfork_attr_shortform_size(dip)); + if (!error) + error = xfs_ifork_verify_local_attr(ip); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + xfs_inode_verifier_error(ip, error, __func__, dip, + sizeof(*dip), __this_address); + error = -EFSCORRUPTED; + break; + } + + if (error) { + kmem_cache_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + } + return error; +} + /* * Reallocate the space for if_broot based on the number of records * being added or deleted as indicated in rec_diff. Move the records @@ -504,38 +503,24 @@ xfs_idata_realloc( void xfs_idestroy_fork( - xfs_inode_t *ip, - int whichfork) + struct xfs_ifork *ifp) { - struct xfs_ifork *ifp; - - ifp = XFS_IFORK_PTR(ip, whichfork); if (ifp->if_broot != NULL) { kmem_free(ifp->if_broot); ifp->if_broot = NULL; } /* - * If the format is local, then we can't have an extents - * array so just look for an inline data array. If we're - * not local then we may or may not have an extents list, - * so check and free it up if we do. + * If the format is local, then we can't have an extents array so just + * look for an inline data array. If we're not local then we may or may + * not have an extents list, so check and free it up if we do. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - if (ifp->if_u1.if_data != NULL) { - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; - } - } else if ((ifp->if_flags & XFS_IFEXTENTS) && ifp->if_height) { - xfs_iext_destroy(ifp); - } - - if (whichfork == XFS_ATTR_FORK) { - kmem_cache_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - } else if (whichfork == XFS_COW_FORK) { - kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); - ip->i_cowfp = NULL; + if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = NULL; + } else if (ifp->if_flags & XFS_IFEXTENTS) { + if (ifp->if_height) + xfs_iext_destroy(ifp); } } @@ -592,7 +577,7 @@ void xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, - xfs_inode_log_item_t *iip, + struct xfs_inode_log_item *iip, int whichfork) { char *cp; @@ -618,7 +603,7 @@ xfs_iflush_fork( } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { @@ -633,7 +618,7 @@ xfs_iflush_fork( !(iip->ili_fields & extflag[whichfork])); if ((iip->ili_fields & extflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); + ASSERT(ifp->if_nextents > 0); (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, whichfork); } @@ -691,48 +676,55 @@ xfs_ifork_init_cow( ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, KM_NOFS); ip->i_cowfp->if_flags = XFS_IFEXTENTS; - ip->i_cformat = XFS_DINODE_FMT_EXTENTS; - ip->i_cnextents = 0; + ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } -/* Default fork content verifiers. */ -struct xfs_ifork_ops xfs_default_ifork_ops = { - .verify_attr = xfs_attr_shortform_verify, - .verify_dir = xfs_dir2_sf_verify, - .verify_symlink = xfs_symlink_shortform_verify, -}; - /* Verify the inline contents of the data fork of an inode. */ -xfs_failaddr_t -xfs_ifork_verify_data( - struct xfs_inode *ip, - struct xfs_ifork_ops *ops) +int +xfs_ifork_verify_local_data( + struct xfs_inode *ip) { - /* Non-local data fork, we're done. */ - if (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) - return NULL; + xfs_failaddr_t fa = NULL; - /* Check the inline data fork if there is one. */ switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFDIR: - return ops->verify_dir(ip); + fa = xfs_dir2_sf_verify(ip); + break; case S_IFLNK: - return ops->verify_symlink(ip); + fa = xfs_symlink_shortform_verify(ip); + break; default: - return NULL; + break; } + + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", + ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa); + return -EFSCORRUPTED; + } + + return 0; } /* Verify the inline contents of the attr fork of an inode. */ -xfs_failaddr_t -xfs_ifork_verify_attr( - struct xfs_inode *ip, - struct xfs_ifork_ops *ops) +int +xfs_ifork_verify_local_attr( + struct xfs_inode *ip) { - /* There has to be an attr fork allocated if aformat is local. */ - if (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) - return NULL; - if (!XFS_IFORK_PTR(ip, XFS_ATTR_FORK)) - return __this_address; - return ops->verify_attr(ip); + struct xfs_ifork *ifp = ip->i_afp; + xfs_failaddr_t fa; + + if (!ifp) + fa = __this_address; + else + fa = xfs_attr_shortform_verify(ip); + + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", + ifp ? ifp->if_u1.if_data : NULL, + ifp ? ifp->if_bytes : 0, fa); + return -EFSCORRUPTED; + } + + return 0; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 668ee942be22..a4953e95c4f3 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -23,6 +23,8 @@ struct xfs_ifork { } if_u1; short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ + int8_t if_format; /* format of this fork */ + xfs_extnum_t if_nextents; /* # of extents in this fork */ }; /* @@ -55,43 +57,36 @@ struct xfs_ifork { ((w) == XFS_ATTR_FORK ? \ XFS_IFORK_ASIZE(ip) : \ 0)) -#define XFS_IFORK_FORMAT(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_format : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_d.di_aformat : \ - (ip)->i_cformat)) -#define XFS_IFORK_FMT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_format = (n)) : \ - ((w) == XFS_ATTR_FORK ? \ - ((ip)->i_d.di_aformat = (n)) : \ - ((ip)->i_cformat = (n)))) -#define XFS_IFORK_NEXTENTS(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - (ip)->i_d.di_nextents : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_d.di_anextents : \ - (ip)->i_cnextents)) -#define XFS_IFORK_NEXT_SET(ip,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((ip)->i_d.di_nextents = (n)) : \ - ((w) == XFS_ATTR_FORK ? \ - ((ip)->i_d.di_anextents = (n)) : \ - ((ip)->i_cnextents = (n)))) #define XFS_IFORK_MAXEXT(ip, w) \ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) -#define xfs_ifork_has_extents(ip, w) \ - (XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_EXTENTS || \ - XFS_IFORK_FORMAT((ip), (w)) == XFS_DINODE_FMT_BTREE) +static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp) +{ + return ifp->if_format == XFS_DINODE_FMT_EXTENTS || + ifp->if_format == XFS_DINODE_FMT_BTREE; +} + +static inline xfs_extnum_t xfs_ifork_nextents(struct xfs_ifork *ifp) +{ + if (!ifp) + return 0; + return ifp->if_nextents; +} + +static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp) +{ + if (!ifp) + return XFS_DINODE_FMT_EXTENTS; + return ifp->if_format; +} struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); -int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); +int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *); +int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); -void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idestroy_fork(struct xfs_ifork *ifp); void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); void xfs_iroot_realloc(struct xfs_inode *, int, int); @@ -175,18 +170,7 @@ extern struct kmem_zone *xfs_ifork_zone; extern void xfs_ifork_init_cow(struct xfs_inode *ip); -typedef xfs_failaddr_t (*xfs_ifork_verifier_t)(struct xfs_inode *); - -struct xfs_ifork_ops { - xfs_ifork_verifier_t verify_symlink; - xfs_ifork_verifier_t verify_dir; - xfs_ifork_verifier_t verify_attr; -}; -extern struct xfs_ifork_ops xfs_default_ifork_ops; - -xfs_failaddr_t xfs_ifork_verify_data(struct xfs_inode *ip, - struct xfs_ifork_ops *ops); -xfs_failaddr_t xfs_ifork_verify_attr(struct xfs_inode *ip, - struct xfs_ifork_ops *ops); +int xfs_ifork_verify_local_data(struct xfs_inode *ip); +int xfs_ifork_verify_local_attr(struct xfs_inode *ip); #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 3bf671637a91..641132d0e39d 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -7,6 +7,73 @@ #define __XFS_LOG_RECOVER_H__ /* + * Each log item type (XFS_LI_*) gets its own xlog_recover_item_ops to + * define how recovery should work for that type of log item. + */ +struct xlog_recover_item; + +/* Sorting hat for log items as they're read in. */ +enum xlog_recover_reorder { + XLOG_REORDER_BUFFER_LIST, + XLOG_REORDER_ITEM_LIST, + XLOG_REORDER_INODE_BUFFER_LIST, + XLOG_REORDER_CANCEL_LIST, +}; + +struct xlog_recover_item_ops { + uint16_t item_type; /* XFS_LI_* type code. */ + + /* + * Help sort recovered log items into the order required to replay them + * correctly. Log item types that always use XLOG_REORDER_ITEM_LIST do + * not have to supply a function here. See the comment preceding + * xlog_recover_reorder_trans for more details about what the return + * values mean. + */ + enum xlog_recover_reorder (*reorder)(struct xlog_recover_item *item); + + /* Start readahead for pass2, if provided. */ + void (*ra_pass2)(struct xlog *log, struct xlog_recover_item *item); + + /* Do whatever work we need to do for pass1, if provided. */ + int (*commit_pass1)(struct xlog *log, struct xlog_recover_item *item); + + /* + * This function should do whatever work is needed for pass2 of log + * recovery, if provided. + * + * If the recovered item is an intent item, this function should parse + * the recovered item to construct an in-core log intent item and + * insert it into the AIL. The in-core log intent item should have 1 + * refcount so that the item is freed either (a) when we commit the + * recovered log item for the intent-done item; (b) replay the work and + * log a new intent-done item; or (c) recovery fails and we have to + * abort. + * + * If the recovered item is an intent-done item, this function should + * parse the recovered item to find the id of the corresponding intent + * log item. Next, it should find the in-core log intent item in the + * AIL and release it. + */ + int (*commit_pass2)(struct xlog *log, struct list_head *buffer_list, + struct xlog_recover_item *item, xfs_lsn_t lsn); +}; + +extern const struct xlog_recover_item_ops xlog_icreate_item_ops; +extern const struct xlog_recover_item_ops xlog_buf_item_ops; +extern const struct xlog_recover_item_ops xlog_inode_item_ops; +extern const struct xlog_recover_item_ops xlog_dquot_item_ops; +extern const struct xlog_recover_item_ops xlog_quotaoff_item_ops; +extern const struct xlog_recover_item_ops xlog_bui_item_ops; +extern const struct xlog_recover_item_ops xlog_bud_item_ops; +extern const struct xlog_recover_item_ops xlog_efi_item_ops; +extern const struct xlog_recover_item_ops xlog_efd_item_ops; +extern const struct xlog_recover_item_ops xlog_rui_item_ops; +extern const struct xlog_recover_item_ops xlog_rud_item_ops; +extern const struct xlog_recover_item_ops xlog_cui_item_ops; +extern const struct xlog_recover_item_ops xlog_cud_item_ops; + +/* * Macros, structures, prototypes for internal log manager use. */ @@ -22,13 +89,13 @@ /* * item headers are in ri_buf[0]. Additional buffers follow. */ -typedef struct xlog_recover_item { +struct xlog_recover_item { struct list_head ri_list; - int ri_type; int ri_cnt; /* count of regions found */ int ri_total; /* total regions */ - xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ -} xlog_recover_item_t; + struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */ + const struct xlog_recover_item_ops *ri_ops; +}; struct xlog_recover { struct hlist_node r_list; @@ -51,4 +118,12 @@ struct xlog_recover { #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 +void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len, + const struct xfs_buf_ops *ops); +bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); +void xlog_recover_iodone(struct xfs_buf *bp); + +void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, + uint64_t intent_id); + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index b2113b17e53c..56d9dd787e7b 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -100,7 +100,6 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ -#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index f42c74cb8be5..9498ced947be 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -66,7 +66,7 @@ xfs_rtbuf_get( ip = issum ? mp->m_rsumip : mp->m_rbmip; - error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, 0); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index c526c5e5ab76..4df87546bd40 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -243,7 +243,7 @@ xfs_validate_sb_common( } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, -"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits."); +"Superblock earlier than Version 5 has XFS_{P|G}QUOTA_{ENFD|CHKD} bits."); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 3b8260ca7d1b..594bc447a7dd 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -204,16 +204,12 @@ xfs_failaddr_t xfs_symlink_shortform_verify( struct xfs_inode *ip) { - char *sfp; - char *endp; - struct xfs_ifork *ifp; - int size; - - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - sfp = (char *)ifp->if_u1.if_data; - size = ifp->if_bytes; - endp = sfp + size; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + char *sfp = (char *)ifp->if_u1.if_data; + int size = ifp->if_bytes; + char *endp = sfp + size; + + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); /* * Zero length symlinks should never occur in memory as they are diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 2b8ccb5b975d..b5dfb6654842 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -27,7 +27,7 @@ xfs_trans_ijoin( struct xfs_inode *ip, uint lock_flags) { - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (ip->i_itemp == NULL) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index add8598eacd5..7badd6dfe544 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -566,8 +566,9 @@ xchk_bmap_check_rmaps( struct xfs_scrub *sc, int whichfork) { - loff_t size; + struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); xfs_agnumber_t agno; + bool zero_size; int error; if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) || @@ -579,6 +580,8 @@ xchk_bmap_check_rmaps( if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK) return 0; + ASSERT(XFS_IFORK_PTR(sc->ip, whichfork) != NULL); + /* * Only do this for complex maps that are in btree format, or for * situations where we would seem to have a size but zero extents. @@ -586,19 +589,14 @@ xchk_bmap_check_rmaps( * to flag this bmap as corrupt if there are rmaps that need to be * reattached. */ - switch (whichfork) { - case XFS_DATA_FORK: - size = i_size_read(VFS_I(sc->ip)); - break; - case XFS_ATTR_FORK: - size = XFS_IFORK_Q(sc->ip); - break; - default: - size = 0; - break; - } - if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE && - (size == 0 || XFS_IFORK_NEXTENTS(sc->ip, whichfork) > 0)) + + if (whichfork == XFS_DATA_FORK) + zero_size = i_size_read(VFS_I(sc->ip)) == 0; + else + zero_size = false; + + if (ifp->if_format != XFS_DINODE_FMT_BTREE && + (zero_size || ifp->if_nextents > 0)) return 0; for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) { @@ -627,12 +625,14 @@ xchk_bmap( struct xchk_bmap_info info = { NULL }; struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t endoff; struct xfs_iext_cursor icur; int error = 0; - ifp = XFS_IFORK_PTR(ip, whichfork); + /* Non-existent forks can be ignored. */ + if (!ifp) + goto out; info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); info.whichfork = whichfork; @@ -641,9 +641,6 @@ xchk_bmap( switch (whichfork) { case XFS_COW_FORK: - /* Non-existent CoW forks are ignorable. */ - if (!ifp) - goto out; /* No CoW forks on non-reflink inodes/filesystems. */ if (!xfs_is_reflink_inode(ip)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -651,8 +648,6 @@ xchk_bmap( } break; case XFS_ATTR_FORK: - if (!ifp) - goto out_check_rmap; if (!xfs_sb_version_hasattr(&mp->m_sb) && !xfs_sb_version_hasattr2(&mp->m_sb)) xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -663,7 +658,7 @@ xchk_bmap( } /* Check the fork values */ - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_UUID: case XFS_DINODE_FMT_DEV: case XFS_DINODE_FMT_LOCAL: @@ -717,7 +712,6 @@ xchk_bmap( goto out; } -out_check_rmap: error = xchk_bmap_check_rmaps(sc, whichfork); if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error)) goto out; diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 9a2e27ac1300..44b15015021f 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -468,7 +468,7 @@ xchk_da_btree( int error; /* Skip short format data structures; no btree to scan. */ - if (!xfs_ifork_has_extents(sc->ip, whichfork)) + if (!xfs_ifork_has_extents(XFS_IFORK_PTR(sc->ip, whichfork))) return 0; /* Set up initial da state. */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index fe2a6e030c8a..7c432997edad 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -635,7 +635,7 @@ xchk_directory_blocks( { struct xfs_bmbt_irec got; struct xfs_da_args args; - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; xfs_fileoff_t leaf_lblk; xfs_fileoff_t free_lblk; @@ -647,11 +647,10 @@ xchk_directory_blocks( int error; /* Ignore local format directories. */ - if (sc->ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - sc->ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS && + ifp->if_format != XFS_DINODE_FMT_BTREE) return 0; - ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET); leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET); free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 64c217eb06a7..6517d67e8d51 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -278,8 +278,7 @@ xchk_iallocbt_check_cluster( &XFS_RMAP_OINFO_INODES); /* Grab the inode cluster buffer. */ - error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, - 0, 0); + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, 0); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) return error; diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 5705adc43a75..855aa8bcab64 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -90,7 +90,7 @@ xchk_parent_count_parent_dentries( * if there is one. */ lock_mode = xfs_ilock_data_map_shared(parent); - if (parent->i_d.di_nextents > 0) + if (parent->i_df.if_nextents > 0) error = xfs_dir3_data_readahead(parent, 0, 0); xfs_iunlock(parent, lock_mode); if (error) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9d9cebf18726..b35611882ff9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -382,7 +382,7 @@ xfs_map_blocks( */ retry: xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); /* @@ -621,14 +621,11 @@ xfs_vm_readpage( return iomap_readpage(page, &xfs_read_iomap_ops); } -STATIC int -xfs_vm_readpages( - struct file *unused, - struct address_space *mapping, - struct list_head *pages, - unsigned nr_pages) +STATIC void +xfs_vm_readahead( + struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &xfs_read_iomap_ops); + iomap_readahead(rac, &xfs_read_iomap_ops); } static int @@ -644,7 +641,7 @@ xfs_iomap_swapfile_activate( const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, - .readpages = xfs_vm_readpages, + .readahead = xfs_vm_readahead, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = iomap_set_page_dirty, diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index c42f90e16b4f..bfad669e6b2f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -367,7 +367,7 @@ xfs_attr_inactive( * removal below. */ if (xfs_inode_hasattr(dp) && - dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + dp->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out_cancel; @@ -388,8 +388,11 @@ out_cancel: xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ - if (dp->i_afp) - xfs_idestroy_fork(dp, XFS_ATTR_FORK); + if (dp->i_afp) { + xfs_idestroy_fork(dp->i_afp); + kmem_cache_free(xfs_ifork_zone, dp->i_afp); + dp->i_afp = NULL; + } if (lock_mode) xfs_iunlock(dp, lock_mode); return error; diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 5ff1d929d3b5..e380bd1a9bfc 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -512,9 +512,9 @@ xfs_attr_list_ilocked( */ if (!xfs_inode_hasattr(dp)) return 0; - else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_list(context); - else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) return xfs_attr_leaf_list(context); return xfs_attr_node_list(context); } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index ee6f4229cebc..6736c5ab188f 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -22,16 +22,20 @@ #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; +static const struct xfs_item_ops xfs_bui_item_ops; + static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_bui_log_item, bui_item); } -void +STATIC void xfs_bui_item_free( struct xfs_bui_log_item *buip) { @@ -45,13 +49,13 @@ xfs_bui_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the BUI. */ -void +STATIC void xfs_bui_release( struct xfs_bui_log_item *buip) { ASSERT(atomic_read(&buip->bui_refcount) > 0); if (atomic_dec_and_test(&buip->bui_refcount)) { - xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); xfs_bui_item_free(buip); } } @@ -124,17 +128,10 @@ xfs_bui_item_release( xfs_bui_release(BUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_bui_item_ops = { - .iop_size = xfs_bui_item_size, - .iop_format = xfs_bui_item_format, - .iop_unpin = xfs_bui_item_unpin, - .iop_release = xfs_bui_item_release, -}; - /* * Allocate and initialize an bui item with the given number of extents. */ -struct xfs_bui_log_item * +STATIC struct xfs_bui_log_item * xfs_bui_init( struct xfs_mount *mp) @@ -278,27 +275,6 @@ xfs_bmap_update_diff_items( return ba->bi_owner->i_ino - bb->bi_owner->i_ino; } -/* Get an BUI. */ -STATIC void * -xfs_bmap_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_bui_log_item *buip; - - ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - ASSERT(tp != NULL); - - buip = xfs_bui_init(tp->t_mountp); - ASSERT(buip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &buip->bui_item); - return buip; -} - /* Set the map extent flags for this mapping. */ static void xfs_trans_set_bmap_flags( @@ -326,16 +302,12 @@ xfs_trans_set_bmap_flags( STATIC void xfs_bmap_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_bui_log_item *buip, + struct xfs_bmap_intent *bmap) { - struct xfs_bui_log_item *buip = intent; - struct xfs_bmap_intent *bmap; uint next_extent; struct xfs_map_extent *map; - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); @@ -355,23 +327,44 @@ xfs_bmap_update_log_item( bmap->bi_bmap.br_state); } +static struct xfs_log_item * +xfs_bmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_bui_log_item *buip = xfs_bui_init(mp); + struct xfs_bmap_intent *bmap; + + ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); + + xfs_trans_add_item(tp, &buip->bui_item); + if (sort) + list_sort(mp, items, xfs_bmap_update_diff_items); + list_for_each_entry(bmap, items, bi_list) + xfs_bmap_update_log_item(tp, buip, bmap); + return &buip->bui_item; +} + /* Get an BUD so we can process all the deferred rmap updates. */ -STATIC void * +static struct xfs_log_item * xfs_bmap_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_bud(tp, intent); + return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; } /* Process a deferred rmap update. */ STATIC int xfs_bmap_update_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_bmap_intent *bmap; xfs_filblks_t count; @@ -379,7 +372,7 @@ xfs_bmap_update_finish_item( bmap = container_of(item, struct xfs_bmap_intent, bi_list); count = bmap->bi_bmap.br_blockcount; - error = xfs_trans_log_finish_bmap_update(tp, done_item, + error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bmap->bi_type, bmap->bi_owner, bmap->bi_whichfork, bmap->bi_bmap.br_startoff, @@ -398,9 +391,9 @@ xfs_bmap_update_finish_item( /* Abort all pending BUIs. */ STATIC void xfs_bmap_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_bui_release(intent); + xfs_bui_release(BUI_ITEM(intent)); } /* Cancel a deferred rmap update. */ @@ -416,10 +409,8 @@ xfs_bmap_update_cancel_item( const struct xfs_defer_op_type xfs_bmap_update_defer_type = { .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .diff_items = xfs_bmap_update_diff_items, .create_intent = xfs_bmap_update_create_intent, .abort_intent = xfs_bmap_update_abort_intent, - .log_item = xfs_bmap_update_log_item, .create_done = xfs_bmap_update_create_done, .finish_item = xfs_bmap_update_finish_item, .cancel_item = xfs_bmap_update_cancel_item, @@ -429,32 +420,30 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = { * Process a bmap update intent item that was recovered from the log. * We need to update some inode's bmbt. */ -int -xfs_bui_recover( - struct xfs_trans *parent_tp, - struct xfs_bui_log_item *buip) +STATIC int +xfs_bui_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - int error = 0; - unsigned int bui_type; + struct xfs_bmbt_irec irec; + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_mount *mp = parent_tp->t_mountp; struct xfs_map_extent *bmap; + struct xfs_bud_log_item *budp; xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; xfs_filblks_t count; - bool op_ok; - struct xfs_bud_log_item *budp; + xfs_exntst_t state; enum xfs_bmap_intent_type type; + bool op_ok; + unsigned int bui_type; int whichfork; - xfs_exntst_t state; - struct xfs_trans *tp; - struct xfs_inode *ip = NULL; - struct xfs_bmbt_irec irec; - struct xfs_mount *mp = parent_tp->t_mountp; - - ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); + int error = 0; /* Only one mapping operation per BUI... */ if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); return -EFSCORRUPTED; } @@ -488,7 +477,6 @@ xfs_bui_recover( * This will pull the BUI from the AIL and * free the memory associated with it. */ - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_bui_release(buip); return -EFSCORRUPTED; } @@ -546,7 +534,6 @@ xfs_bui_recover( xfs_bmap_unmap_extent(tp, ip, &irec); } - set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); xfs_defer_move(parent_tp, tp); error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -563,3 +550,121 @@ err_inode: } return error; } + +STATIC bool +xfs_bui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return BUI_ITEM(lip)->bui_format.bui_id == intent_id; +} + +static const struct xfs_item_ops xfs_bui_item_ops = { + .iop_size = xfs_bui_item_size, + .iop_format = xfs_bui_item_format, + .iop_unpin = xfs_bui_item_unpin, + .iop_release = xfs_bui_item_release, + .iop_recover = xfs_bui_item_recover, + .iop_match = xfs_bui_item_match, +}; + +/* + * Copy an BUI format buffer from the given buf, and into the destination + * BUI format structure. The BUI/BUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_bui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_bui_log_format *dst_bui_fmt) +{ + struct xfs_bui_log_format *src_bui_fmt; + uint len; + + src_bui_fmt = buf->i_addr; + len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + + if (buf->i_len == len) { + memcpy(dst_bui_fmt, src_bui_fmt, len); + return 0; + } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent bmap update + * item from the bui format structure which was logged on disk. + * It allocates an in-core bui, copies the extents from the format + * structure into it, and adds the bui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_bui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_bui_log_item *buip; + struct xfs_bui_log_format *bui_formatp; + + bui_formatp = item->ri_buf[0].i_addr; + + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + buip = xfs_bui_init(mp); + error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); + if (error) { + xfs_bui_item_free(buip); + return error; + } + atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn); + xfs_bui_release(buip); + return 0; +} + +const struct xlog_recover_item_ops xlog_bui_item_ops = { + .item_type = XFS_LI_BUI, + .commit_pass2 = xlog_recover_bui_commit_pass2, +}; + +/* + * This routine is called when an BUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding BUI if it + * was still in the log. To do this it searches the AIL for the BUI with an id + * equal to that in the BUD format structure. If we find it we drop the BUD + * reference, which removes the BUI from the AIL and frees it. + */ +STATIC int +xlog_recover_bud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_bud_log_format *bud_formatp; + + bud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_BUI, bud_formatp->bud_bui_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_bud_item_ops = { + .item_type = XFS_LI_BUD, + .commit_pass2 = xlog_recover_bud_commit_pass2, +}; diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index ad479cc73de8..b9be62f8bd52 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -33,11 +33,6 @@ struct kmem_zone; #define XFS_BUI_MAX_FAST_EXTENTS 1 /* - * Define BUI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_BUI_RECOVERED 1 - -/* * This is the "bmap update intent" log item. It is used to log the fact that * some reverse mappings need to change. It is used in conjunction with the * "bmap update done" log item described below. @@ -49,7 +44,6 @@ struct xfs_bui_log_item { struct xfs_log_item bui_item; atomic_t bui_refcount; atomic_t bui_next_extent; - unsigned long bui_flags; /* misc flags */ struct xfs_bui_log_format bui_format; }; @@ -74,9 +68,4 @@ struct xfs_bud_log_item { extern struct kmem_zone *xfs_bui_zone; extern struct kmem_zone *xfs_bud_zone; -struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *); -void xfs_bui_item_free(struct xfs_bui_log_item *); -void xfs_bui_release(struct xfs_bui_log_item *); -int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip); - #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 4f800f7fe888..f37f5cc4b19f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -223,7 +223,7 @@ xfs_bmap_count_blocks( if (!ifp) return 0; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_BTREE: if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); @@ -449,7 +449,7 @@ xfs_getbmap( break; } - switch (XFS_IFORK_FORMAT(ip, whichfork)) { + switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: break; @@ -1210,17 +1210,26 @@ xfs_swap_extents_check_format( struct xfs_inode *ip, /* target inode */ struct xfs_inode *tip) /* tmp inode */ { + struct xfs_ifork *ifp = &ip->i_df; + struct xfs_ifork *tifp = &tip->i_df; + + /* User/group/project quota ids must match if quotas are enforced. */ + if (XFS_IS_QUOTA_ON(ip->i_mount) && + (!uid_eq(VFS_I(ip)->i_uid, VFS_I(tip)->i_uid) || + !gid_eq(VFS_I(ip)->i_gid, VFS_I(tip)->i_gid) || + ip->i_d.di_projid != tip->i_d.di_projid)) + return -EINVAL; /* Should never get a local format */ - if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || - tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + if (ifp->if_format == XFS_DINODE_FMT_LOCAL || + tifp->if_format == XFS_DINODE_FMT_LOCAL) return -EINVAL; /* * if the target inode has less extents that then temporary inode then * why did userspace call us? */ - if (ip->i_d.di_nextents < tip->i_d.di_nextents) + if (ifp->if_nextents < tifp->if_nextents) return -EINVAL; /* @@ -1235,20 +1244,18 @@ xfs_swap_extents_check_format( * form then we will end up with the target inode in the wrong format * as we already know there are less extents in the temp inode. */ - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && + tifp->if_format == XFS_DINODE_FMT_BTREE) return -EINVAL; /* Check temp in extent form to max in target */ - if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + if (tifp->if_format == XFS_DINODE_FMT_EXTENTS && + tifp->if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; /* Check target in extent form to max in temp */ - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && + ifp->if_nextents > XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; /* @@ -1260,22 +1267,20 @@ xfs_swap_extents_check_format( * (a common defrag case) which will occur when the temp inode is in * extent format... */ - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (tifp->if_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_Q(ip) && - XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) + XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip)) return -EINVAL; - if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; } /* Reciprocal target->temp btree format checks */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { if (XFS_IFORK_Q(tip) && XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) return -EINVAL; - if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= - XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; } @@ -1427,15 +1432,15 @@ xfs_swap_extent_forks( /* * Count the number of extended attribute blocks */ - if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && - (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 && + ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk, &aforkblks); if (error) return error; } - if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && - (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 && + tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk, &taforkblks); if (error) @@ -1450,9 +1455,9 @@ xfs_swap_extent_forks( * bmbt scan as the last step. */ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) + if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) (*target_log_flags) |= XFS_ILOG_DOWNER; - if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE) (*src_log_flags) |= XFS_ILOG_DOWNER; } @@ -1468,9 +1473,6 @@ xfs_swap_extent_forks( ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; - swap(ip->i_d.di_nextents, tip->i_d.di_nextents); - swap(ip->i_d.di_format, tip->i_d.di_format); - /* * The extents in the source inode could still contain speculative * preallocation beyond EOF (e.g. the file is open but not modified @@ -1484,7 +1486,7 @@ xfs_swap_extent_forks( tip->i_delayed_blks = ip->i_delayed_blks; ip->i_delayed_blks = 0; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: (*src_log_flags) |= XFS_ILOG_DEXT; break; @@ -1495,7 +1497,7 @@ xfs_swap_extent_forks( break; } - switch (tip->i_d.di_format) { + switch (tip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: (*target_log_flags) |= XFS_ILOG_DEXT; break; @@ -1606,7 +1608,7 @@ xfs_swap_extents( if (xfs_inode_has_cow_data(tip)) { error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true); if (error) - return error; + goto out_unlock; } /* @@ -1615,9 +1617,9 @@ xfs_swap_extents( * performed with log redo items! */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { - int w = XFS_DATA_FORK; - uint32_t ipnext = XFS_IFORK_NEXTENTS(ip, w); - uint32_t tipnext = XFS_IFORK_NEXTENTS(tip, w); + int w = XFS_DATA_FORK; + uint32_t ipnext = ip->i_df.if_nextents; + uint32_t tipnext = tip->i_df.if_nextents; /* * Conceptually this shouldn't affect the shape of either bmbt, @@ -1717,10 +1719,11 @@ xfs_swap_extents( /* Swap the cow forks. */ if (xfs_sb_version_hasreflink(&mp->m_sb)) { - ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS); - ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS); + ASSERT(!ip->i_cowfp || + ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS); + ASSERT(!tip->i_cowfp || + tip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS); - swap(ip->i_cnextents, tip->i_cnextents); swap(ip->i_cowfp, tip->i_cowfp); if (ip->i_cowfp && ip->i_cowfp->if_bytes) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9ec3eaf1c618..20b748f7e186 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -477,7 +477,7 @@ _xfs_buf_map_pages( nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + -1); if (bp->b_addr) break; vm_unmap_aliases(); @@ -1197,8 +1197,10 @@ xfs_buf_ioend( bp->b_ops->verify_read(bp); } - if (!bp->b_error) + if (!bp->b_error) { + bp->b_flags &= ~XBF_WRITE_FAIL; bp->b_flags |= XBF_DONE; + } if (bp->b_iodone) (*(bp->b_iodone))(bp); @@ -1242,10 +1244,26 @@ xfs_buf_ioerror_alert( struct xfs_buf *bp, xfs_failaddr_t func) { - xfs_alert_ratelimited(bp->b_mount, -"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", - func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, - -bp->b_error); + xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", + "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", + func, (uint64_t)XFS_BUF_ADDR(bp), + bp->b_length, -bp->b_error); +} + +/* + * To simulate an I/O failure, the buffer must be locked and held with at least + * three references. The LRU reference is dropped by the stale call. The buf + * item reference is dropped via ioend processing. The third reference is owned + * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC. + */ +void +xfs_buf_ioend_fail( + struct xfs_buf *bp) +{ + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioerror(bp, -EIO); + xfs_buf_ioend(bp); } int @@ -1258,7 +1276,7 @@ xfs_bwrite( bp->b_flags |= XBF_WRITE; bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | - XBF_WRITE_FAIL | XBF_DONE); + XBF_DONE); error = xfs_buf_submit(bp); if (error) @@ -1272,6 +1290,11 @@ xfs_buf_bio_end_io( { struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; + if (!bio->bi_status && + (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && + XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) + bio->bi_status = BLK_STS_IOERR; + /* * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. @@ -1480,10 +1503,7 @@ __xfs_buf_submit( /* on shutdown we stale and complete the buffer immediately */ if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { - xfs_buf_ioerror(bp, -EIO); - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioend(bp); + xfs_buf_ioend_fail(bp); return -EIO; } @@ -1642,7 +1662,8 @@ xfs_wait_buftarg( struct xfs_buftarg *btp) { LIST_HEAD(dispose); - int loop = 0; + int loop = 0; + bool write_fail = false; /* * First wait on the buftarg I/O count for all in-flight buffers to be @@ -1670,17 +1691,29 @@ xfs_wait_buftarg( bp = list_first_entry(&dispose, struct xfs_buf, b_lru); list_del_init(&bp->b_lru); if (bp->b_flags & XBF_WRITE_FAIL) { - xfs_alert(btp->bt_mount, + write_fail = true; + xfs_buf_alert_ratelimited(bp, + "XFS: Corruption Alert", "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", (long long)bp->b_bn); - xfs_alert(btp->bt_mount, -"Please run xfs_repair to determine the extent of the problem."); } xfs_buf_rele(bp); } if (loop++ != 0) delay(100); } + + /* + * If one or more failed buffers were freed, that means dirty metadata + * was thrown away. This should only ever happen after I/O completion + * handling has elevated I/O error(s) to permanent failures and shuts + * down the fs. + */ + if (write_fail) { + ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount)); + xfs_alert(btp->bt_mount, + "Please run xfs_repair to determine the extent of the problem."); + } } static enum lru_status @@ -1813,6 +1846,13 @@ xfs_alloc_buftarg( btp->bt_bdev = bdev; btp->bt_daxdev = dax_dev; + /* + * Buffer IO error rate limiting. Limit it to no more than 10 messages + * per 30 seconds so as to not spam logs too much on repeated errors. + */ + ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, + DEFAULT_RATELIMIT_BURST); + if (xfs_setsize_buftarg_early(btp, bdev)) goto error_free; @@ -1983,7 +2023,7 @@ xfs_buf_delwri_submit_buffers( * synchronously. Otherwise, drop the buffer from the delwri * queue and submit async. */ - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL); + bp->b_flags &= ~_XBF_DELWRI_Q; bp->b_flags |= XBF_WRITE; if (wait_list) { bp->b_flags &= ~XBF_ASYNC; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 9a04c53c2488..050c53b739e2 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -91,6 +91,7 @@ typedef struct xfs_buftarg { struct list_lru bt_lru; struct percpu_counter bt_io_count; + struct ratelimit_state bt_ioerror_rl; } xfs_buftarg_t; struct xfs_buf; @@ -263,6 +264,7 @@ extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); +void xfs_buf_ioend_fail(struct xfs_buf *); extern int __xfs_buf_submit(struct xfs_buf *bp, bool); static inline int xfs_buf_submit(struct xfs_buf *bp) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1545657c3ca0..9e75e8d6042e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -410,7 +410,6 @@ xfs_buf_item_unpin( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); xfs_buf_t *bp = bip->bli_buf; - struct xfs_ail *ailp = lip->li_ailp; int stale = bip->bli_flags & XFS_BLI_STALE; int freed; @@ -452,10 +451,10 @@ xfs_buf_item_unpin( } /* - * If we get called here because of an IO error, we may - * or may not have the item on the AIL. xfs_trans_ail_delete() - * will take care of that situation. - * xfs_trans_ail_delete() drops the AIL lock. + * If we get called here because of an IO error, we may or may + * not have the item on the AIL. xfs_trans_ail_delete() will + * take care of that situation. xfs_trans_ail_delete() drops + * the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { xfs_buf_do_callbacks(bp); @@ -463,47 +462,23 @@ xfs_buf_item_unpin( list_del_init(&bp->b_li_list); bp->b_iodone = NULL; } else { - spin_lock(&ailp->ail_lock); - xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); ASSERT(bp->b_log_item == NULL); } xfs_buf_relse(bp); } else if (freed && remove) { /* - * There are currently two references to the buffer - the active - * LRU reference and the buf log item. What we are about to do - * here - simulate a failed IO completion - requires 3 - * references. - * - * The LRU reference is removed by the xfs_buf_stale() call. The - * buf item reference is removed by the xfs_buf_iodone() - * callback that is run by xfs_buf_do_callbacks() during ioend - * processing (via the bp->b_iodone callback), and then finally - * the ioend processing will drop the IO reference if the buffer - * is marked XBF_ASYNC. - * - * Hence we need to take an additional reference here so that IO - * completion processing doesn't free the buffer prematurely. + * The buffer must be locked and held by the caller to simulate + * an async I/O failure. */ xfs_buf_lock(bp); xfs_buf_hold(bp); bp->b_flags |= XBF_ASYNC; - xfs_buf_ioerror(bp, -EIO); - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioend(bp); + xfs_buf_ioend_fail(bp); } } -/* - * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30 - * seconds so as to not spam logs too much on repeated detection of the same - * buffer being bad.. - */ - -static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); - STATIC uint xfs_buf_item_push( struct xfs_log_item *lip, @@ -533,11 +508,10 @@ xfs_buf_item_push( trace_xfs_buf_item_push(bip); /* has a previous flush failed due to IO errors? */ - if ((bp->b_flags & XBF_WRITE_FAIL) && - ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) { - xfs_warn(bp->b_mount, -"Failing async write on buffer block 0x%llx. Retrying async write.", - (long long)bp->b_bn); + if (bp->b_flags & XBF_WRITE_FAIL) { + xfs_buf_alert_ratelimited(bp, "XFS: Failing async write", + "Failing async write on buffer block 0x%llx. Retrying async write.", + (long long)bp->b_bn); } if (!xfs_buf_delwri_queue(bp, buffer_list)) @@ -584,7 +558,7 @@ xfs_buf_item_put( * state. */ if (aborted) - xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(lip, 0); xfs_buf_item_relse(bip->bli_buf); return true; } @@ -1229,61 +1203,19 @@ xfs_buf_iodone( struct xfs_buf *bp, struct xfs_log_item *lip) { - struct xfs_ail *ailp = lip->li_ailp; - ASSERT(BUF_ITEM(lip)->bli_buf == bp); xfs_buf_rele(bp); /* - * If we are forcibly shutting down, this may well be - * off the AIL already. That's because we simulate the - * log-committed callbacks to unpin these buffers. Or we may never - * have put this item on AIL because of the transaction was - * aborted forcibly. xfs_trans_ail_delete() takes care of these. + * If we are forcibly shutting down, this may well be off the AIL + * already. That's because we simulate the log-committed callbacks to + * unpin these buffers. Or we may never have put this item on AIL + * because of the transaction was aborted forcibly. + * xfs_trans_ail_delete() takes care of these. * * Either way, AIL is useless if we're forcing a shutdown. */ - spin_lock(&ailp->ail_lock); - xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + xfs_trans_ail_delete(lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } - -/* - * Requeue a failed buffer for writeback. - * - * We clear the log item failed state here as well, but we have to be careful - * about reference counts because the only active reference counts on the buffer - * may be the failed log items. Hence if we clear the log item failed state - * before queuing the buffer for IO we can release all active references to - * the buffer and free it, leading to use after free problems in - * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which - * order we process them in - the buffer is locked, and we own the buffer list - * so nothing on them is going to change while we are performing this action. - * - * Hence we can safely queue the buffer for IO before we clear the failed log - * item state, therefore always having an active reference to the buffer and - * avoiding the transient zero-reference state that leads to use-after-free. - * - * Return true if the buffer was added to the buffer list, false if it was - * already on the buffer list. - */ -bool -xfs_buf_resubmit_failed_buffers( - struct xfs_buf *bp, - struct list_head *buffer_list) -{ - struct xfs_log_item *lip; - bool ret; - - ret = xfs_buf_delwri_queue(bp, buffer_list); - - /* - * XFS_LI_FAILED set/clear is protected by ail_lock, caller of this - * function already have it acquired - */ - list_for_each_entry(lip, &bp->b_li_list, li_bio_list) - xfs_clear_li_failed(lip); - - return ret; -} diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 30114b510332..c9c57e2da932 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -59,8 +59,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *, struct xfs_log_item *); void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); -bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, - struct list_head *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c new file mode 100644 index 000000000000..04faa7310c4f --- /dev/null +++ b/fs/xfs/xfs_buf_item_recover.c @@ -0,0 +1,984 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_error.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_quota.h" + +/* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; + +static struct xfs_buf_cancel * +xlog_find_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + struct list_head *bucket; + struct xfs_buf_cancel *bcp; + + if (!log->l_buf_cancel_table) + return NULL; + + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + return bcp; + } + + return NULL; +} + +static bool +xlog_add_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + struct xfs_buf_cancel *bcp; + + /* + * If we find an existing cancel record, this indicates that the buffer + * was cancelled multiple times. To ensure that during pass 2 we keep + * the record in the table until we reach its last occurrence in the + * log, a reference count is kept to tell how many times we expect to + * see this record during the second pass. + */ + bcp = xlog_find_buffer_cancelled(log, blkno, len); + if (bcp) { + bcp->bc_refcount++; + return false; + } + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); + bcp->bc_blkno = blkno; + bcp->bc_len = len; + bcp->bc_refcount = 1; + list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno)); + return true; +} + +/* + * Check if there is and entry for blkno, len in the buffer cancel record table. + */ +bool +xlog_is_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + return xlog_find_buffer_cancelled(log, blkno, len) != NULL; +} + +/* + * Check if there is and entry for blkno, len in the buffer cancel record table, + * and decremented the reference count on it if there is one. + * + * Remove the cancel record once the refcount hits zero, so that if the same + * buffer is re-used again after its last cancellation we actually replay the + * changes made at that point. + */ +static bool +xlog_put_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len) +{ + struct xfs_buf_cancel *bcp; + + bcp = xlog_find_buffer_cancelled(log, blkno, len); + if (!bcp) { + ASSERT(0); + return false; + } + + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } + return true; +} + +/* log buffer item recovery */ + +/* + * Sort buffer items for log recovery. Most buffer items should end up on the + * buffer list and are recovered first, with the following exceptions: + * + * 1. XFS_BLF_CANCEL buffers must be processed last because some log items + * might depend on the incor ecancellation record, and replaying a cancelled + * buffer item can remove the incore record. + * + * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that + * we replay di_next_unlinked only after flushing the inode 'free' state + * to the inode buffer. + * + * See xlog_recover_reorder_trans for more details. + */ +STATIC enum xlog_recover_reorder +xlog_recover_buf_reorder( + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + + if (buf_f->blf_flags & XFS_BLF_CANCEL) + return XLOG_REORDER_CANCEL_LIST; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + return XLOG_REORDER_INODE_BUFFER_LIST; + return XLOG_REORDER_BUFFER_LIST; +} + +STATIC void +xlog_recover_buf_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + + xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); +} + +/* + * Build up the table of buf cancel records so that we don't replay cancelled + * data in the second pass. + */ +static int +xlog_recover_buf_commit_pass1( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; + + if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { + xfs_err(log->l_mp, "bad buffer log item size (%d)", + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + + if (!(bf->blf_flags & XFS_BLF_CANCEL)) + trace_xfs_log_recover_buf_not_cancel(log, bf); + else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len)) + trace_xfs_log_recover_buf_cancel_add(log, bf); + else + trace_xfs_log_recover_buf_cancel_ref_inc(log, bf); + return 0; +} + +/* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recover_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f, + xfs_lsn_t current_lsn) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + uint32_t magic32; + uint16_t magic16; + uint16_t magicda; + char *warnmsg = NULL; + + /* + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTB_MAGIC: + bp->b_ops = &xfs_bnobt_buf_ops; + break; + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_cntbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_finobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + case XFS_RMAP_CRC_MAGIC: + bp->b_ops = &xfs_rmapbt_buf_ops; + break; + case XFS_REFC_CRC_MAGIC: + bp->b_ops = &xfs_refcountbt_buf_ops; + break; + default: + warnmsg = "Bad btree block magic!"; + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + warnmsg = "Bad AGF block magic!"; + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (magic32 != XFS_AGFL_MAGIC) { + warnmsg = "Bad AGFL block magic!"; + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + warnmsg = "Bad AGI block magic!"; + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + warnmsg = "Bad DQUOT block magic!"; + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + if (magic16 != XFS_DINODE_MAGIC) { + warnmsg = "Bad INODE block magic!"; + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + warnmsg = "Bad symlink block magic!"; + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + warnmsg = "Bad dir block magic!"; + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + warnmsg = "Bad dir data magic!"; + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + warnmsg = "Bad dir3 free magic!"; + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + warnmsg = "Bad dir leaf1 magic!"; + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + warnmsg = "Bad dir leafn magic!"; + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + warnmsg = "Bad da node magic!"; + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + warnmsg = "Bad attr leaf magic!"; + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + warnmsg = "Bad attr remote magic!"; + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + warnmsg = "Bad SB block magic!"; + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; +#ifdef CONFIG_XFS_RT + case XFS_BLFT_RTBITMAP_BUF: + case XFS_BLFT_RTSUMMARY_BUF: + /* no magic numbers for verification of RT buffers */ + bp->b_ops = &xfs_rtbuf_ops; + break; +#endif /* CONFIG_XFS_RT */ + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } + + /* + * Nothing else to do in the case of a NULL current LSN as this means + * the buffer is more recent than the change in the log and will be + * skipped. + */ + if (current_lsn == NULLCOMMITLSN) + return; + + if (warnmsg) { + xfs_warn(mp, warnmsg); + ASSERT(0); + } + + /* + * We must update the metadata LSN of the buffer as it is written out to + * ensure that older transactions never replay over this one and corrupt + * the buffer. This can occur if log recovery is interrupted at some + * point after the current transaction completes, at which point a + * subsequent mount starts recovery from the beginning. + * + * Write verifiers update the metadata LSN from log items attached to + * the buffer. Therefore, initialize a bli purely to carry the LSN to + * the verifier. We'll clean it up in our ->iodone() callback. + */ + if (bp->b_ops) { + struct xfs_buf_log_item *bip; + + ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_item_init(bp, mp); + bip = bp->b_log_item; + bip->bli_item.li_lsn = current_lsn; + } +} + +/* + * Perform a 'normal' buffer recovery. Each logged region of the + * buffer should be copied over the corresponding region in the + * given buffer. The bitmap in the buf log format structure indicates + * where to place the logged data. + */ +STATIC void +xlog_recover_do_reg_buffer( + struct xfs_mount *mp, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f, + xfs_lsn_t current_lsn) +{ + int i; + int bit; + int nbits; + xfs_failaddr_t fa; + const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); + + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + + bit = 0; + i = 1; /* 0 is the buf format structure */ + while (1) { + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + if (bit == -1) + break; + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + ASSERT(item->ri_buf[i].i_addr != NULL); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(BBTOB(bp->b_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); + + /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + + /* + * Do a sanity check if this is a dquot buffer. Just checking + * the first dquot in the buffer should do. XXXThis is + * probably a good thing to do for other buf types also. + */ + fa = NULL; + if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + if (item->ri_buf[i].i_addr == NULL) { + xfs_alert(mp, + "XFS: NULL dquot in %s.", __func__); + goto next; + } + if (item->ri_buf[i].i_len < size_disk_dquot) { + xfs_alert(mp, + "XFS: dquot too small (%d) in %s.", + item->ri_buf[i].i_len, __func__); + goto next; + } + fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, + -1, 0); + if (fa) { + xfs_alert(mp, + "dquot corrupt at %pS trying to replay into block 0x%llx", + fa, bp->b_bn); + goto next; + } + } + + memcpy(xfs_buf_offset(bp, + (uint)bit << XFS_BLF_SHIFT), /* dest */ + item->ri_buf[i].i_addr, /* source */ + nbits<<XFS_BLF_SHIFT); /* length */ + next: + i++; + bit += nbits; + } + + /* Shouldn't be any more regions */ + ASSERT(i == item->ri_total); + + xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); +} + +/* + * Perform a dquot buffer recovery. + * Simple algorithm: if we have found a QUOTAOFF log item of the same type + * (ie. USR or GRP), then just toss this buffer away; don't recover it. + * Else, treat it as a regular buffer and do recovery. + * + * Return false if the buffer was tossed and true if we recovered the buffer to + * indicate to the caller if the buffer needs writing. + */ +STATIC bool +xlog_recover_do_dquot_buffer( + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) +{ + uint type; + + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (!mp->m_qflags) + return false; + + type = 0; + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) + type |= XFS_DQ_USER; + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) + type |= XFS_DQ_PROJ; + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) + type |= XFS_DQ_GROUP; + /* + * This type of quotas was turned off, so ignore this buffer + */ + if (log->l_quotaoffs_flag & type) + return false; + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); + return true; +} + +/* + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). + * + * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() below during recovery. + */ +STATIC int +xlog_recover_do_inode_buffer( + struct xfs_mount *mp, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) +{ + int i; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; + int next_unlinked_offset; + int inodes_per_buf; + xfs_agino_t *logged_nextp; + xfs_agino_t *buffer_nextp; + + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + + /* + * Post recovery validation only works properly on CRC enabled + * filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; + + inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; + for (i = 0; i < inodes_per_buf; i++) { + next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + + offsetof(xfs_dinode_t, di_next_unlinked); + + while (next_unlinked_offset >= + (reg_buf_offset + reg_buf_bytes)) { + /* + * The next di_next_unlinked field is beyond + * the current logged region. Find the next + * logged region that contains or is beyond + * the current di_next_unlinked field. + */ + bit += nbits; + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + + /* + * If there are no more logged regions in the + * buffer, then we're done. + */ + if (bit == -1) + return 0; + + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); + ASSERT(nbits > 0); + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; + item_index++; + } + + /* + * If the current logged region starts after the current + * di_next_unlinked field, then move on to the next + * di_next_unlinked field. + */ + if (next_unlinked_offset < reg_buf_offset) + continue; + + ASSERT(item->ri_buf[item_index].i_addr != NULL); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); + ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); + + /* + * The current logged region contains a copy of the + * current di_next_unlinked field. Extract its value + * and copy it to the buffer copy. + */ + logged_nextp = item->ri_buf[item_index].i_addr + + next_unlinked_offset - reg_buf_offset; + if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { + xfs_alert(mp, + "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). " + "Trying to replay bad (0) inode di_next_unlinked field.", + item, bp); + return -EFSCORRUPTED; + } + + buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); + *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + + } + + return 0; +} + +/* + * V5 filesystems know the age of the buffer on disk being recovered. We can + * have newer objects on disk than we are replaying, and so for these cases we + * don't want to replay the current change as that will make the buffer contents + * temporarily invalid on disk. + * + * The magic number might not match the buffer type we are going to recover + * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence + * extract the LSN of the existing object in the buffer based on it's current + * magic number. If we don't recognise the magic number in the buffer, then + * return a LSN of -1 so that the caller knows it was an unrecognised block and + * so can recover the buffer. + * + * Note: we cannot rely solely on magic number matches to determine that the + * buffer has a valid LSN - we also need to verify that it belongs to this + * filesystem, so we need to extract the object's LSN and compare it to that + * which we read from the superblock. If the UUIDs don't match, then we've got a + * stale metadata block from an old filesystem instance that we need to recover + * over the top of. + */ +static xfs_lsn_t +xlog_recover_get_buf_lsn( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + uint32_t magic32; + uint16_t magic16; + uint16_t magicda; + void *blk = bp->b_addr; + uuid_t *uuid; + xfs_lsn_t lsn = -1; + + /* v4 filesystems always recover immediately */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto recover_immediately; + + magic32 = be32_to_cpu(*(__be32 *)blk); + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + case XFS_RMAP_CRC_MAGIC: + case XFS_REFC_CRC_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); + uuid = &btb->bb_u.s.bb_uuid; + break; + } + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); + uuid = &btb->bb_u.l.bb_uuid; + break; + } + case XFS_AGF_MAGIC: + lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + uuid = &((struct xfs_agf *)blk)->agf_uuid; + break; + case XFS_AGFL_MAGIC: + lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + uuid = &((struct xfs_agfl *)blk)->agfl_uuid; + break; + case XFS_AGI_MAGIC: + lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + uuid = &((struct xfs_agi *)blk)->agi_uuid; + break; + case XFS_SYMLINK_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; + break; + case XFS_DIR3_BLOCK_MAGIC: + case XFS_DIR3_DATA_MAGIC: + case XFS_DIR3_FREE_MAGIC: + lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; + break; + case XFS_ATTR3_RMT_MAGIC: + /* + * Remote attr blocks are written synchronously, rather than + * being logged. That means they do not contain a valid LSN + * (i.e. transactionally ordered) in them, and hence any time we + * see a buffer to replay over the top of a remote attribute + * block we should simply do so. + */ + goto recover_immediately; + case XFS_SB_MAGIC: + /* + * superblock uuids are magic. We may or may not have a + * sb_meta_uuid on disk, but it will be set in the in-core + * superblock. We set the uuid pointer for verification + * according to the superblock feature mask to ensure we check + * the relevant UUID in the superblock. + */ + lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); + if (xfs_sb_version_hasmetauuid(&mp->m_sb)) + uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; + else + uuid = &((struct xfs_dsb *)blk)->sb_uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + case XFS_DA3_NODE_MAGIC: + lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + /* + * We do individual object checks on dquot and inode buffers as they + * have their own individual LSN records. Also, we could have a stale + * buffer here, so we have to at least recognise these buffer types. + * + * A notd complexity here is inode unlinked list processing - it logs + * the inode directly in the buffer, but we don't know which inodes have + * been modified, and there is no global buffer LSN. Hence we need to + * recover all inode buffer types immediately. This problem will be + * fixed by logical logging of the unlinked list modifications. + */ + magic16 = be16_to_cpu(*(__be16 *)blk); + switch (magic16) { + case XFS_DQUOT_MAGIC: + case XFS_DINODE_MAGIC: + goto recover_immediately; + default: + break; + } + + /* unknown buffer contents, recover immediately */ + +recover_immediately: + return (xfs_lsn_t)-1; + +} + +/* + * This routine replays a modification made to a buffer at runtime. + * There are actually two types of buffer, regular and inode, which + * are handled differently. Inode buffers are handled differently + * in that we only recover a specific set of data from them, namely + * the inode di_next_unlinked fields. This is because all other inode + * data is actually logged via inode records and any data we replay + * here which overlaps that may be stale. + * + * When meta-data buffers are freed at run time we log a buffer item + * with the XFS_BLF_CANCEL bit set to indicate that previous copies + * of the buffer in the log should not be replayed at recovery time. + * This is so that if the blocks covered by the buffer are reused for + * file data before we crash we don't end up replaying old, freed + * meta-data into a user's file. + * + * To handle the cancellation of buffer log items, we make two passes + * over the log during recovery. During the first we build a table of + * those buffers which have been cancelled, and during the second we + * only replay those buffers which do not have corresponding cancel + * records in the table. See xlog_recover_buf_pass[1,2] above + * for more details on the implementation of the table of cancel records. + */ +STATIC int +xlog_recover_buf_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp; + int error; + uint buf_flags; + xfs_lsn_t lsn; + + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. + */ + if (buf_f->blf_flags & XFS_BLF_CANCEL) { + if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len)) + goto cancelled; + } else { + + if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len)) + goto cancelled; + } + + trace_xfs_log_recover_buf_recover(log, buf_f); + + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; + + error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags, &bp, NULL); + if (error) + return error; + + /* + * Recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. + * + * Note that we have to be extremely careful of readahead here. + * Readahead does not attach verfiers to the buffers so if we don't + * actually do any replay after readahead because of the LSN we found + * in the buffer if more recent than that current transaction then we + * need to attach the verifier directly. Failure to do so can lead to + * future recovery actions (e.g. EFI and unlinked list recovery) can + * operate on the buffers and they won't get the verifier attached. This + * can lead to blocks on disk having the correct content but a stale + * CRC. + * + * It is safe to assume these clean buffers are currently up to date. + * If the buffer is dirtied by a later transaction being replayed, then + * the verifier will be reset to match whatever recover turns that + * buffer into. + */ + lsn = xlog_recover_get_buf_lsn(mp, bp); + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_buf_skip(log, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); + goto out_release; + } + + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); + if (error) + goto out_release; + } else if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + bool dirty; + + dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + if (!dirty) + goto out_release; + } else { + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); + } + + /* + * Perform delayed write on the buffer. Asynchronous writes will be + * slower when taking into account all the buffers to be flushed. + * + * Also make sure that only inode buffers with good sizes stay in + * the buffer cache. The kernel moves inodes in buffers of 1 block + * or inode_cluster_size bytes, whichever is bigger. The inode + * buffers in the log can be a different size if the log was generated + * by an older kernel using unclustered inode buffers or a newer kernel + * running with a different inode cluster size. Regardless, if the + * the inode buffer size isn't max(blocksize, inode_cluster_size) + * for *our* value of inode_cluster_size, then we need to keep + * the buffer out of the buffer cache so that the buffer won't + * overlap with future reads of those inodes. + */ + if (XFS_DINODE_MAGIC == + be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && + (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { + xfs_buf_stale(bp); + error = xfs_bwrite(bp); + } else { + ASSERT(bp->b_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + } + +out_release: + xfs_buf_relse(bp); + return error; +cancelled: + trace_xfs_log_recover_buf_cancel(log, buf_f); + return 0; +} + +const struct xlog_recover_item_ops xlog_buf_item_ops = { + .item_type = XFS_LI_BUF, + .reorder = xlog_recover_buf_reorder, + .ra_pass2 = xlog_recover_buf_ra_pass2, + .commit_pass1 = xlog_recover_buf_commit_pass1, + .commit_pass2 = xlog_recover_buf_commit_pass2, +}; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 871ec22c9aee..66deddd5e296 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -524,7 +524,7 @@ xfs_readdir( args.geo = dp->i_mount->m_dir_geo; args.trans = tp; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) ; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index af2c8e5ceea0..d5b7f03e93c8 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -75,7 +75,7 @@ xfs_qm_adjust_dqlimits( int prealloc = 0; ASSERT(d->d_id); - defq = xfs_get_defquota(dq, q); + defq = xfs_get_defquota(q, xfs_dquot_type(dq)); if (defq->bsoftlimit && !d->d_blk_softlimit) { d->d_blk_softlimit = cpu_to_be64(defq->bsoftlimit); @@ -114,9 +114,14 @@ xfs_qm_adjust_dqlimits( void xfs_qm_adjust_dqtimers( struct xfs_mount *mp, - struct xfs_disk_dquot *d) + struct xfs_dquot *dq) { + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_disk_dquot *d = &dq->q_core; + struct xfs_def_quota *defq; + ASSERT(d->d_id); + defq = xfs_get_defquota(qi, xfs_dquot_type(dq)); #ifdef DEBUG if (d->d_blk_hardlimit) @@ -138,7 +143,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_bcount) > be64_to_cpu(d->d_blk_hardlimit)))) { d->d_btimer = cpu_to_be32(ktime_get_real_seconds() + - mp->m_quotainfo->qi_btimelimit); + defq->btimelimit); } else { d->d_bwarns = 0; } @@ -161,7 +166,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_icount) > be64_to_cpu(d->d_ino_hardlimit)))) { d->d_itimer = cpu_to_be32(ktime_get_real_seconds() + - mp->m_quotainfo->qi_itimelimit); + defq->itimelimit); } else { d->d_iwarns = 0; } @@ -184,7 +189,7 @@ xfs_qm_adjust_dqtimers( (be64_to_cpu(d->d_rtbcount) > be64_to_cpu(d->d_rtb_hardlimit)))) { d->d_rtbtimer = cpu_to_be32(ktime_get_real_seconds() + - mp->m_quotainfo->qi_rtbtimelimit); + defq->rtbtimelimit); } else { d->d_rtbwarns = 0; } @@ -205,16 +210,18 @@ xfs_qm_adjust_dqtimers( */ STATIC void xfs_qm_init_dquot_blk( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dqid_t id, - uint type, - xfs_buf_t *bp) + struct xfs_trans *tp, + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct xfs_buf *bp) { struct xfs_quotainfo *q = mp->m_quotainfo; - xfs_dqblk_t *d; - xfs_dqid_t curid; - int i; + struct xfs_dqblk *d; + xfs_dqid_t curid; + unsigned int qflag; + unsigned int blftype; + int i; ASSERT(tp); ASSERT(xfs_buf_islocked(bp)); @@ -238,11 +245,39 @@ xfs_qm_init_dquot_blk( } } - xfs_trans_dquot_buf(tp, bp, - (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : - ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : - XFS_BLF_GDQUOT_BUF))); - xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); + if (type & XFS_DQ_USER) { + qflag = XFS_UQUOTA_CHKD; + blftype = XFS_BLF_UDQUOT_BUF; + } else if (type & XFS_DQ_PROJ) { + qflag = XFS_PQUOTA_CHKD; + blftype = XFS_BLF_PDQUOT_BUF; + } else { + qflag = XFS_GQUOTA_CHKD; + blftype = XFS_BLF_GDQUOT_BUF; + } + + xfs_trans_dquot_buf(tp, bp, blftype); + + /* + * quotacheck uses delayed writes to update all the dquots on disk in an + * efficient manner instead of logging the individual dquot changes as + * they are made. However if we log the buffer allocated here and crash + * after quotacheck while the logged initialisation is still in the + * active region of the log, log recovery can replay the dquot buffer + * initialisation over the top of the checked dquots and corrupt quota + * accounting. + * + * To avoid this problem, quotacheck cannot log the initialised buffer. + * We must still dirty the buffer and write it back before the + * allocation transaction clears the log. Therefore, mark the buffer as + * ordered instead of logging it directly. This is safe for quotacheck + * because it detects and repairs allocated but initialized dquot blocks + * in the quota inodes. + */ + if (!(mp->m_qflags & qflag)) + xfs_trans_ordered_buf(tp, bp); + else + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); } /* @@ -1021,6 +1056,7 @@ xfs_qm_dqflush_done( struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip; struct xfs_dquot *dqp = qip->qli_dquot; struct xfs_ail *ailp = lip->li_ailp; + xfs_lsn_t tail_lsn; /* * We only want to pull the item from the AIL if its @@ -1034,10 +1070,11 @@ xfs_qm_dqflush_done( ((lip->li_lsn == qip->qli_flush_lsn) || test_bit(XFS_LI_FAILED, &lip->li_flags))) { - /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->ail_lock); if (lip->li_lsn == qip->qli_flush_lsn) { - xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + /* xfs_ail_update_finish() drops the AIL lock */ + tail_lsn = xfs_ail_delete_one(ailp, lip); + xfs_ail_update_finish(ailp, tail_lsn); } else { /* * Clear the failed state since we are about to drop the @@ -1068,6 +1105,7 @@ xfs_qm_dqflush( struct xfs_buf **bpp) { struct xfs_mount *mp = dqp->q_mount; + struct xfs_log_item *lip = &dqp->q_logitem.qli_item; struct xfs_buf *bp; struct xfs_dqblk *dqb; struct xfs_disk_dquot *ddqp; @@ -1084,31 +1122,15 @@ xfs_qm_dqflush( xfs_qm_dqunpin_wait(dqp); /* - * This may have been unpinned because the filesystem is shutting - * down forcibly. If that's the case we must not write this dquot - * to disk, because the log record didn't make it to disk. - * - * We also have to remove the log item from the AIL in this case, - * as we wait for an emptry AIL as part of the unmount process. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - struct xfs_log_item *lip = &dqp->q_logitem.qli_item; - dqp->dq_flags &= ~XFS_DQ_DIRTY; - - xfs_trans_ail_remove(lip, SHUTDOWN_CORRUPT_INCORE); - - error = -EIO; - goto out_unlock; - } - - /* * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, &bp, &xfs_dquot_buf_ops); - if (error) + if (error == -EAGAIN) goto out_unlock; + if (error) + goto out_abort; /* * Calculate the location of the dquot inside the buffer. @@ -1116,17 +1138,15 @@ xfs_qm_dqflush( dqb = bp->b_addr + dqp->q_bufoffset; ddqp = &dqb->dd_diskdq; - /* - * A simple sanity check in case we got a corrupted dquot. - */ - fa = xfs_dqblk_verify(mp, dqb, be32_to_cpu(ddqp->d_id), 0); + /* sanity check the in-core structure before we flush */ + fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(dqp->q_core.d_id), + 0); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", - be32_to_cpu(ddqp->d_id), fa); + be32_to_cpu(dqp->q_core.d_id), fa); xfs_buf_relse(bp); - xfs_dqfunlock(dqp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - return -EFSCORRUPTED; + error = -EFSCORRUPTED; + goto out_abort; } /* This is the only portion of data that needs to persist */ @@ -1175,6 +1195,10 @@ xfs_qm_dqflush( *bpp = bp; return 0; +out_abort: + dqp->dq_flags &= ~XFS_DQ_DIRTY; + xfs_trans_ail_delete(lip, 0); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); out_unlock: xfs_dqfunlock(dqp); return error; diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index fe3e46df604b..71e36c85e20b 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -154,7 +154,7 @@ void xfs_qm_dqdestroy(struct xfs_dquot *dqp); int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp); void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp); void xfs_qm_adjust_dqtimers(struct xfs_mount *mp, - struct xfs_disk_dquot *d); + struct xfs_dquot *d); void xfs_qm_adjust_dqlimits(struct xfs_mount *mp, struct xfs_dquot *d); xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type); diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index baad1748d0d1..349c92d26570 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -145,21 +145,6 @@ xfs_qm_dquot_logitem_push( if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - /* - * The buffer containing this item failed to be written back - * previously. Resubmit the buffer for IO - */ - if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { - if (!xfs_buf_trylock(bp)) - return XFS_ITEM_LOCKED; - - if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list)) - rval = XFS_ITEM_FLUSHING; - - xfs_buf_unlock(bp); - return rval; - } - if (!xfs_dqlock_nowait(dqp)) return XFS_ITEM_LOCKED; @@ -358,7 +343,7 @@ xfs_qm_qoff_logitem_relse( ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) || test_bit(XFS_LI_ABORTED, &lip->li_flags) || XFS_FORCED_SHUTDOWN(lip->li_mountp)); - xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(lip, 0); kmem_free(lip->li_lv_shadow); kmem_free(qoff); } diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c new file mode 100644 index 000000000000..3400be4c88f0 --- /dev/null +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" + +STATIC void +xlog_recover_dquot_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_disk_dquot *recddq; + struct xfs_dq_logformat *dq_f; + uint type; + + if (mp->m_qflags == 0) + return; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) + return; + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + return; + + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return; + + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + ASSERT(dq_f->qlf_len == 1); + + xlog_buf_readahead(log, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), + &xfs_dquot_buf_ra_ops); +} + +/* + * Recover a dquot record + */ +STATIC int +xlog_recover_dquot_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp; + struct xfs_disk_dquot *ddq, *recddq; + struct xfs_dq_logformat *dq_f; + xfs_failaddr_t fa; + int error; + uint type; + + /* + * Filesystems are required to send in quota flags at mount time. + */ + if (mp->m_qflags == 0) + return 0; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) { + xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); + return -EFSCORRUPTED; + } + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) { + xfs_alert(log->l_mp, "dquot too small (%d) in %s.", + item->ri_buf[1].i_len, __func__); + return -EFSCORRUPTED; + } + + /* + * This type of quotas was turned off, so ignore this record. + */ + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return 0; + + /* + * At this point we know that quota was _not_ turned off. + * Since the mount flags are not indicating to us otherwise, this + * must mean that quota is on, and the dquot needs to be replayed. + * Remember that we may not have fully recovered the superblock yet, + * so we can't do the usual trick of looking at the SB quota bits. + * + * The other possibility, of course, is that the quota subsystem was + * removed since the last mount - ENOSYS. + */ + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0); + if (fa) { + xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", + dq_f->qlf_id, fa); + return -EFSCORRUPTED; + } + ASSERT(dq_f->qlf_len == 1); + + /* + * At this point we are assuming that the dquots have been allocated + * and hence the buffer has valid dquots stamped in it. It should, + * therefore, pass verifier validation. If the dquot is bad, then the + * we'll return an error here, so we don't need to specifically check + * the dquot in the buffer after the verifier has run. + */ + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + &xfs_dquot_buf_ops); + if (error) + return error; + + ASSERT(bp); + ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); + + /* + * If the dquot has an LSN in it, recover the dquot only if it's less + * than the lsn of the transaction we are replaying. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; + xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + goto out_release; + } + } + + memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + ASSERT(dq_f->qlf_size == 2); + ASSERT(bp->b_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); + return 0; +} + +const struct xlog_recover_item_ops xlog_dquot_item_ops = { + .item_type = XFS_LI_DQUOT, + .ra_pass2 = xlog_recover_dquot_ra_pass2, + .commit_pass2 = xlog_recover_dquot_commit_pass2, +}; + +/* + * Recover QUOTAOFF records. We simply make a note of it in the xlog + * structure, so that we know not to do any dquot item or dquot buffer recovery, + * of that type. + */ +STATIC int +xlog_recover_quotaoff_commit_pass1( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].i_addr; + ASSERT(qoff_f); + + /* + * The logitem format's flag tells us if this was user quotaoff, + * group/project quotaoff or both. + */ + if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_USER; + if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_PROJ; + if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_GROUP; + + return 0; +} + +const struct xlog_recover_item_ops xlog_quotaoff_item_ops = { + .item_type = XFS_LI_QUOTAOFF, + .commit_pass1 = xlog_recover_quotaoff_commit_pass1, + /* nothing to commit in pass2 */ +}; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index a21e9cc6516a..7f6e20899473 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -53,6 +53,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_FORCE_SCRUB_REPAIR, XFS_RANDOM_FORCE_SUMMARY_RECALC, XFS_RANDOM_IUNLINK_FALLBACK, + XFS_RANDOM_BUF_IOERROR, }; struct xfs_errortag_attr { @@ -162,6 +163,7 @@ XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); +XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -199,6 +201,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(force_repair), XFS_ERRORTAG_ATTR_LIST(bad_summary), XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), + XFS_ERRORTAG_ATTR_LIST(buf_ioerror), NULL, }; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6ea847f6e298..b9c333bae0a1 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -22,16 +22,20 @@ #include "xfs_bmap.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_efi_zone; kmem_zone_t *xfs_efd_zone; +static const struct xfs_item_ops xfs_efi_item_ops; + static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_efi_log_item, efi_item); } -void +STATIC void xfs_efi_item_free( struct xfs_efi_log_item *efip) { @@ -49,13 +53,13 @@ xfs_efi_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the EFI. */ -void +STATIC void xfs_efi_release( struct xfs_efi_log_item *efip) { ASSERT(atomic_read(&efip->efi_refcount) > 0); if (atomic_dec_and_test(&efip->efi_refcount)) { - xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); } } @@ -139,18 +143,10 @@ xfs_efi_item_release( xfs_efi_release(EFI_ITEM(lip)); } -static const struct xfs_item_ops xfs_efi_item_ops = { - .iop_size = xfs_efi_item_size, - .iop_format = xfs_efi_item_format, - .iop_unpin = xfs_efi_item_unpin, - .iop_release = xfs_efi_item_release, -}; - - /* * Allocate and initialize an efi item with the given number of extents. */ -struct xfs_efi_log_item * +STATIC struct xfs_efi_log_item * xfs_efi_init( struct xfs_mount *mp, uint nextents) @@ -161,7 +157,7 @@ xfs_efi_init( ASSERT(nextents > 0); if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(xfs_efi_log_item_t) + + size = (uint)(sizeof(struct xfs_efi_log_item) + ((nextents - 1) * sizeof(xfs_extent_t))); efip = kmem_zalloc(size, 0); } else { @@ -184,7 +180,7 @@ xfs_efi_init( * one of which will be the native format for this kernel. * It will handle the conversion of formats if necessary. */ -int +STATIC int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; @@ -412,41 +408,16 @@ xfs_extent_free_diff_items( XFS_FSB_TO_AGNO(mp, rb->xefi_startblock); } -/* Get an EFI. */ -STATIC void * -xfs_extent_free_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_efi_log_item *efip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - efip = xfs_efi_init(tp->t_mountp, count); - ASSERT(efip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &efip->efi_item); - return efip; -} - /* Log a free extent to the intent item. */ STATIC void xfs_extent_free_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_efi_log_item *efip, + struct xfs_extent_free_item *free) { - struct xfs_efi_log_item *efip = intent; - struct xfs_extent_free_item *free; uint next_extent; struct xfs_extent *extp; - free = container_of(item, struct xfs_extent_free_item, xefi_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); @@ -462,29 +433,50 @@ xfs_extent_free_log_item( extp->ext_len = free->xefi_blockcount; } +static struct xfs_log_item * +xfs_extent_free_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); + struct xfs_extent_free_item *free; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &efip->efi_item); + if (sort) + list_sort(mp, items, xfs_extent_free_diff_items); + list_for_each_entry(free, items, xefi_list) + xfs_extent_free_log_item(tp, efip, free); + return &efip->efi_item; +} + /* Get an EFD so we can process all the free extents. */ -STATIC void * +static struct xfs_log_item * xfs_extent_free_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_efd(tp, intent, count); + return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; } /* Process a free extent. */ STATIC int xfs_extent_free_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_extent_free_item *free; int error; free = container_of(item, struct xfs_extent_free_item, xefi_list); - error = xfs_trans_free_extent(tp, done_item, + error = xfs_trans_free_extent(tp, EFD_ITEM(done), free->xefi_startblock, free->xefi_blockcount, &free->xefi_oinfo, free->xefi_skip_discard); @@ -495,9 +487,9 @@ xfs_extent_free_finish_item( /* Abort all pending EFIs. */ STATIC void xfs_extent_free_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_efi_release(intent); + xfs_efi_release(EFI_ITEM(intent)); } /* Cancel a free extent. */ @@ -513,10 +505,8 @@ xfs_extent_free_cancel_item( const struct xfs_defer_op_type xfs_extent_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, - .log_item = xfs_extent_free_log_item, .create_done = xfs_extent_free_create_done, .finish_item = xfs_extent_free_finish_item, .cancel_item = xfs_extent_free_cancel_item, @@ -529,12 +519,12 @@ const struct xfs_defer_op_type xfs_extent_free_defer_type = { STATIC int xfs_agfl_free_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_efd_log_item *efdp = done_item; + struct xfs_efd_log_item *efdp = EFD_ITEM(done); struct xfs_extent_free_item *free; struct xfs_extent *extp; struct xfs_buf *agbp; @@ -579,10 +569,8 @@ xfs_agfl_free_finish_item( /* sub-type with special handling for AGFL deferred frees */ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .diff_items = xfs_extent_free_diff_items, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, - .log_item = xfs_extent_free_log_item, .create_done = xfs_extent_free_create_done, .finish_item = xfs_agfl_free_finish_item, .cancel_item = xfs_extent_free_cancel_item, @@ -592,19 +580,19 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. */ -int -xfs_efi_recover( - struct xfs_mount *mp, - struct xfs_efi_log_item *efip) +STATIC int +xfs_efi_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - struct xfs_efd_log_item *efdp; - struct xfs_trans *tp; - int i; - int error = 0; - xfs_extent_t *extp; - xfs_fsblock_t startblock_fsb; - - ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_efd_log_item *efdp; + struct xfs_trans *tp; + struct xfs_extent *extp; + xfs_fsblock_t startblock_fsb; + int i; + int error = 0; /* * First check the validity of the extents described by the @@ -623,7 +611,6 @@ xfs_efi_recover( * This will pull the EFI from the AIL and * free the memory associated with it. */ - set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip); return -EFSCORRUPTED; } @@ -644,7 +631,6 @@ xfs_efi_recover( } - set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); error = xfs_trans_commit(tp); return error; @@ -652,3 +638,93 @@ abort_error: xfs_trans_cancel(tp); return error; } + +STATIC bool +xfs_efi_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return EFI_ITEM(lip)->efi_format.efi_id == intent_id; +} + +static const struct xfs_item_ops xfs_efi_item_ops = { + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_unpin = xfs_efi_item_unpin, + .iop_release = xfs_efi_item_release, + .iop_recover = xfs_efi_item_recover, + .iop_match = xfs_efi_item_match, +}; + +/* + * This routine is called to create an in-core extent free intent + * item from the efi format structure which was logged on disk. + * It allocates an in-core efi, copies the extents from the format + * structure into it, and adds the efi to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_efi_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_efi_log_item *efip; + struct xfs_efi_log_format *efi_formatp; + int error; + + efi_formatp = item->ri_buf[0].i_addr; + + efip = xfs_efi_init(mp, efi_formatp->efi_nextents); + error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); + if (error) { + xfs_efi_item_free(efip); + return error; + } + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn); + xfs_efi_release(efip); + return 0; +} + +const struct xlog_recover_item_ops xlog_efi_item_ops = { + .item_type = XFS_LI_EFI, + .commit_pass2 = xlog_recover_efi_commit_pass2, +}; + +/* + * This routine is called when an EFD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding EFI if it + * was still in the log. To do this it searches the AIL for the EFI with an id + * equal to that in the EFD format structure. If we find it we drop the EFD + * reference, which removes the EFI from the AIL and frees it. + */ +STATIC int +xlog_recover_efd_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_efd_log_format *efd_formatp; + + efd_formatp = item->ri_buf[0].i_addr; + ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || + (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); + + xlog_recover_release_intent(log, XFS_LI_EFI, efd_formatp->efd_efi_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_efd_item_ops = { + .item_type = XFS_LI_EFD, + .commit_pass2 = xlog_recover_efd_commit_pass2, +}; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 16aaab06d4ec..cd2860c875bf 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -17,11 +17,6 @@ struct kmem_zone; #define XFS_EFI_MAX_FAST_EXTENTS 16 /* - * Define EFI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_EFI_RECOVERED 1 - -/* * This is the "extent free intention" log item. It is used to log the fact * that some extents need to be free. It is used in conjunction with the * "extent free done" log item described below. @@ -50,25 +45,24 @@ struct kmem_zone; * of commit failure or log I/O errors. Note that the EFD is not inserted in the * AIL, so at this point both the EFI and EFD are freed. */ -typedef struct xfs_efi_log_item { +struct xfs_efi_log_item { struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; - unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; -} xfs_efi_log_item_t; +}; /* * This is the "extent free done" log item. It is used to log * the fact that some extents earlier mentioned in an efi item * have been freed. */ -typedef struct xfs_efd_log_item { +struct xfs_efd_log_item { struct xfs_log_item efd_item; - xfs_efi_log_item_t *efd_efip; + struct xfs_efi_log_item *efd_efip; uint efd_next_extent; xfs_efd_log_format_t efd_format; -} xfs_efd_log_item_t; +}; /* * Max number of extents in fast allocation path. @@ -78,13 +72,4 @@ typedef struct xfs_efd_log_item { extern struct kmem_zone *xfs_efi_zone; extern struct kmem_zone *xfs_efd_zone; -xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); -int xfs_efi_copy_format(xfs_log_iovec_t *buf, - xfs_efi_log_format_t *dst_efi_fmt); -void xfs_efi_item_free(xfs_efi_log_item_t *); -void xfs_efi_release(struct xfs_efi_log_item *); - -int xfs_efi_recover(struct xfs_mount *mp, - struct xfs_efi_log_item *efip); - #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4b8bdecc3863..403c90309a8f 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1102,7 +1102,7 @@ xfs_dir_open( * certain to have the next operation be a read there. */ mode = xfs_ilock_data_map_shared(ip); - if (ip->i_d.di_nextents > 0) + if (ip->i_df.if_nextents > 0) error = xfs_dir3_data_readahead(ip, 0, 0); xfs_iunlock(ip, mode); return error; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 3e61d0cc23f8..ef1d5bb88b93 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -504,10 +504,7 @@ xfs_do_force_shutdown( } else if (logerror) { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, "Log I/O Error Detected. Shutting down filesystem"); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "All device paths lost. Shutting down filesystem"); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + } else { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, "I/O Error Detected. Shutting down filesystem"); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 8bf1d15be3f6..5daef654956c 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -22,6 +22,7 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" +#include "xfs_ialloc.h" #include <linux/iversion.h> @@ -62,8 +63,6 @@ xfs_inode_alloc( memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); ip->i_afp = NULL; ip->i_cowfp = NULL; - ip->i_cnextents = 0; - ip->i_cformat = XFS_DINODE_FMT_EXTENTS; memset(&ip->i_df, 0, sizeof(ip->i_df)); ip->i_flags = 0; ip->i_delayed_blks = 0; @@ -88,15 +87,18 @@ xfs_inode_free_callback( case S_IFREG: case S_IFDIR: case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); + xfs_idestroy_fork(&ip->i_df); break; } - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - if (ip->i_cowfp) - xfs_idestroy_fork(ip, XFS_COW_FORK); - + if (ip->i_afp) { + xfs_idestroy_fork(ip->i_afp); + kmem_cache_free(xfs_ifork_zone, ip->i_afp); + } + if (ip->i_cowfp) { + xfs_idestroy_fork(ip->i_cowfp); + kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); + } if (ip->i_itemp) { ASSERT(!test_bit(XFS_LI_IN_AIL, &ip->i_itemp->ili_item.li_flags)); @@ -423,6 +425,7 @@ xfs_iget_cache_hit( spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); if (error) { bool wake; @@ -456,9 +459,6 @@ xfs_iget_cache_hit( ip->i_sick = 0; ip->i_checked = 0; - ASSERT(!rwsem_is_locked(&inode->i_rwsem)); - init_rwsem(&inode->i_rwsem); - spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); } else { @@ -479,7 +479,7 @@ xfs_iget_cache_hit( xfs_ilock(ip, lock_flags); if (!(flags & XFS_IGET_INCORE)) - xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + xfs_iflags_clear(ip, XFS_ISTALE); XFS_STATS_INC(mp, xs_ig_found); return 0; @@ -510,18 +510,42 @@ xfs_iget_cache_miss( if (!ip) return -ENOMEM; - error = xfs_iread(mp, tp, ip, flags); + error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags); if (error) goto out_destroy; - if (!xfs_inode_verify_forks(ip)) { - error = -EFSCORRUPTED; - goto out_destroy; + /* + * For version 5 superblocks, if we are initialising a new inode and we + * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can + * simply build the new inode core with a random generation number. + * + * For version 4 (and older) superblocks, log recovery is dependent on + * the di_flushiter field being initialised from the current on-disk + * value and hence we must also read the inode off disk even when + * initializing new inodes. + */ + if (xfs_sb_version_has_v3inode(&mp->m_sb) && + (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { + VFS_I(ip)->i_generation = prandom_u32(); + } else { + struct xfs_dinode *dip; + struct xfs_buf *bp; + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0); + if (error) + goto out_destroy; + + error = xfs_inode_from_disk(ip, dip); + if (!error) + xfs_buf_set_ref(bp, XFS_INO_REF); + xfs_trans_brelse(tp, bp); + + if (error) + goto out_destroy; } trace_xfs_iget_miss(ip); - /* * Check the inode free state is valid. This also detects lookup * racing with unlinks. @@ -561,7 +585,7 @@ xfs_iget_cache_miss( */ iflags = XFS_INEW; if (flags & XFS_IGET_DONTCACHE) - iflags |= XFS_IDONTCACHE; + d_mark_dontcache(VFS_I(ip)); ip->i_udquot = NULL; ip->i_gdquot = NULL; ip->i_pdquot = NULL; @@ -737,13 +761,18 @@ xfs_icache_inode_is_allocated( */ #define XFS_LOOKUP_BATCH 32 -STATIC int -xfs_inode_ag_walk_grab( +/* + * Decide if the given @ip is eligible to be a part of the inode walk, and + * grab it if so. Returns true if it's ready to go or false if we should just + * ignore it. + */ +STATIC bool +xfs_inode_walk_ag_grab( struct xfs_inode *ip, int flags) { struct inode *inode = VFS_I(ip); - bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); + bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT); ASSERT(rcu_read_lock_held()); @@ -768,39 +797,41 @@ xfs_inode_ag_walk_grab( /* nothing to sync during shutdown */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EFSCORRUPTED; + return false; /* If we can't grab the inode, it must on it's way to reclaim. */ if (!igrab(inode)) - return -ENOENT; + return false; /* inode is valid */ - return 0; + return true; out_unlock_noent: spin_unlock(&ip->i_flags_lock); - return -ENOENT; + return false; } +/* + * For a given per-AG structure @pag, grab, @execute, and rele all incore + * inodes with the given radix tree @tag. + */ STATIC int -xfs_inode_ag_walk( - struct xfs_mount *mp, +xfs_inode_walk_ag( struct xfs_perag *pag, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, + int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), void *args, - int tag, - int iter_flags) + int tag) { + struct xfs_mount *mp = pag->pag_mount; uint32_t first_index; int last_error = 0; int skipped; - int done; + bool done; int nr_found; restart: - done = 0; + done = false; skipped = 0; first_index = 0; nr_found = 0; @@ -811,7 +842,7 @@ restart: rcu_read_lock(); - if (tag == -1) + if (tag == XFS_ICI_NO_TAG) nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)batch, first_index, XFS_LOOKUP_BATCH); @@ -833,7 +864,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) + if (done || !xfs_inode_walk_ag_grab(ip, iter_flags)) batch[i] = NULL; /* @@ -852,7 +883,7 @@ restart: continue; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - done = 1; + done = true; } /* unlock now we've grabbed the inodes. */ @@ -861,10 +892,10 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; - if ((iter_flags & XFS_AGITER_INEW_WAIT) && + if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) && xfs_iflags_test(batch[i], XFS_INEW)) xfs_inew_wait(batch[i]); - error = execute(batch[i], flags, args); + error = execute(batch[i], args); xfs_irele(batch[i]); if (error == -EAGAIN) { skipped++; @@ -889,6 +920,49 @@ restart: return last_error; } +/* Fetch the next (possibly tagged) per-AG structure. */ +static inline struct xfs_perag * +xfs_inode_walk_get_perag( + struct xfs_mount *mp, + xfs_agnumber_t agno, + int tag) +{ + if (tag == XFS_ICI_NO_TAG) + return xfs_perag_get(mp, agno); + return xfs_perag_get_tag(mp, agno, tag); +} + +/* + * Call the @execute function on all incore inodes matching the radix tree + * @tag. + */ +int +xfs_inode_walk( + struct xfs_mount *mp, + int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, + int tag) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == -EFSCORRUPTED) + break; + } + } + return last_error; +} + /* * Background scanning to trim post-EOF preallocated space. This is queued * based on the 'speculative_prealloc_lifetime' tunable (5m by default). @@ -952,75 +1026,6 @@ xfs_cowblocks_worker( xfs_queue_cowblocks(mp); } -int -xfs_inode_ag_iterator_flags( - struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, - void *args, - int iter_flags) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - - ag = 0; - while ((pag = xfs_perag_get(mp, ag))) { - ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1, - iter_flags); - xfs_perag_put(pag); - if (error) { - last_error = error; - if (error == -EFSCORRUPTED) - break; - } - } - return last_error; -} - -int -xfs_inode_ag_iterator( - struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, - void *args) -{ - return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0); -} - -int -xfs_inode_ag_iterator_tag( - struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int flags, - void *args, - int tag) -{ - struct xfs_perag *pag; - int error = 0; - int last_error = 0; - xfs_agnumber_t ag; - - ag = 0; - while ((pag = xfs_perag_get_tag(mp, ag, tag))) { - ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag, - 0); - xfs_perag_put(pag); - if (error) { - last_error = error; - if (error == -EFSCORRUPTED) - break; - } - } - return last_error; -} - /* * Grab the inode for reclaim exclusively. * Return 0 if we grabbed it, non-zero otherwise. @@ -1128,7 +1133,7 @@ restart: if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); /* xfs_iflush_abort() drops the flush lock */ - xfs_iflush_abort(ip, false); + xfs_iflush_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) { @@ -1419,59 +1424,90 @@ xfs_reclaim_inodes_count( return reclaimable; } -STATIC int +STATIC bool xfs_inode_match_id( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) - return 0; + return false; if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) - return 0; + return false; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && ip->i_d.di_projid != eofb->eof_prid) - return 0; + return false; - return 1; + return true; } /* * A union-based inode filtering algorithm. Process the inode if any of the * criteria match. This is for global/internal scans only. */ -STATIC int +STATIC bool xfs_inode_match_id_union( struct xfs_inode *ip, struct xfs_eofblocks *eofb) { if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) - return 1; + return true; if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) - return 1; + return true; if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && ip->i_d.di_projid == eofb->eof_prid) - return 1; + return true; - return 0; + return false; +} + +/* + * Is this inode @ip eligible for eof/cow block reclamation, given some + * filtering parameters @eofb? The inode is eligible if @eofb is null or + * if the predicate functions match. + */ +static bool +xfs_inode_matches_eofb( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + bool match; + + if (!eofb) + return true; + + if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + match = xfs_inode_match_id_union(ip, eofb); + else + match = xfs_inode_match_id(ip, eofb); + if (!match) + return false; + + /* skip the inode if the file size is too small */ + if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return false; + + return true; } STATIC int xfs_inode_free_eofblocks( struct xfs_inode *ip, - int flags, void *args) { - int ret = 0; - struct xfs_eofblocks *eofb = args; - int match; + struct xfs_eofblocks *eofb = args; + bool wait; + int ret; + + wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); if (!xfs_can_free_eofblocks(ip, false)) { /* inode could be preallocated or append-only */ @@ -1484,62 +1520,34 @@ xfs_inode_free_eofblocks( * If the mapping is dirty the operation can block and wait for some * time. Unless we are waiting, skip it. */ - if (!(flags & SYNC_WAIT) && - mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) + if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) return 0; - if (eofb) { - if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) - match = xfs_inode_match_id_union(ip, eofb); - else - match = xfs_inode_match_id(ip, eofb); - if (!match) - return 0; - - /* skip the inode if the file size is too small */ - if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && - XFS_ISIZE(ip) < eofb->eof_min_file_size) - return 0; - } + if (!xfs_inode_matches_eofb(ip, eofb)) + return 0; /* * If the caller is waiting, return -EAGAIN to keep the background * scanner moving and revisit the inode in a subsequent pass. */ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - if (flags & SYNC_WAIT) - ret = -EAGAIN; - return ret; + if (wait) + return -EAGAIN; + return 0; } + ret = xfs_free_eofblocks(ip); xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } -static int -__xfs_icache_free_eofblocks( - struct xfs_mount *mp, - struct xfs_eofblocks *eofb, - int (*execute)(struct xfs_inode *ip, int flags, - void *args), - int tag) -{ - int flags = SYNC_TRYLOCK; - - if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) - flags = SYNC_WAIT; - - return xfs_inode_ag_iterator_tag(mp, execute, flags, - eofb, tag); -} - int xfs_icache_free_eofblocks( struct xfs_mount *mp, struct xfs_eofblocks *eofb) { - return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks, + return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb, XFS_ICI_EOFBLOCKS_TAG); } @@ -1756,29 +1764,16 @@ xfs_prep_free_cowblocks( STATIC int xfs_inode_free_cowblocks( struct xfs_inode *ip, - int flags, void *args) { struct xfs_eofblocks *eofb = args; - int match; int ret = 0; if (!xfs_prep_free_cowblocks(ip)) return 0; - if (eofb) { - if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) - match = xfs_inode_match_id_union(ip, eofb); - else - match = xfs_inode_match_id(ip, eofb); - if (!match) - return 0; - - /* skip the inode if the file size is too small */ - if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && - XFS_ISIZE(ip) < eofb->eof_min_file_size) - return 0; - } + if (!xfs_inode_matches_eofb(ip, eofb)) + return 0; /* Free the CoW blocks */ xfs_ilock(ip, XFS_IOLOCK_EXCL); @@ -1802,7 +1797,7 @@ xfs_icache_free_cowblocks( struct xfs_mount *mp, struct xfs_eofblocks *eofb) { - return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks, + return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb, XFS_ICI_COWBLOCKS_TAG); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 48f1fd2bb6ad..93b54e7d55f0 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -24,7 +24,7 @@ struct xfs_eofblocks { * tags for inode radix tree */ #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup - in xfs_inode_ag_iterator */ + in xfs_inode_walk */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ #define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ #define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */ @@ -40,7 +40,7 @@ struct xfs_eofblocks { /* * flags for AG inode iterator */ -#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */ +#define XFS_INODE_WALK_INEW_WAIT 0x1 /* wait on new inodes */ int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); @@ -71,50 +71,9 @@ int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); void xfs_cowblocks_worker(struct work_struct *); void xfs_queue_cowblocks(struct xfs_mount *); -int xfs_inode_ag_iterator(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, void *args), - int flags, void *args); -int xfs_inode_ag_iterator_flags(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, void *args), - int flags, void *args, int iter_flags); -int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, - int (*execute)(struct xfs_inode *ip, int flags, void *args), - int flags, void *args, int tag); - -static inline int -xfs_fs_eofblocks_from_user( - struct xfs_fs_eofblocks *src, - struct xfs_eofblocks *dst) -{ - if (src->eof_version != XFS_EOFBLOCKS_VERSION) - return -EINVAL; - - if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) - return -EINVAL; - - if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || - memchr_inv(src->pad64, 0, sizeof(src->pad64))) - return -EINVAL; - - dst->eof_flags = src->eof_flags; - dst->eof_prid = src->eof_prid; - dst->eof_min_file_size = src->eof_min_file_size; - - dst->eof_uid = INVALID_UID; - if (src->eof_flags & XFS_EOF_FLAGS_UID) { - dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); - if (!uid_valid(dst->eof_uid)) - return -EINVAL; - } - - dst->eof_gid = INVALID_GID; - if (src->eof_flags & XFS_EOF_FLAGS_GID) { - dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); - if (!gid_valid(dst->eof_gid)) - return -EINVAL; - } - return 0; -} +int xfs_inode_walk(struct xfs_mount *mp, int iter_flags, + int (*execute)(struct xfs_inode *ip, void *args), + void *args, int tag); int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 490fee22b878..287a9e5c7d75 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -6,11 +6,19 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_icreate_item.h" #include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_ialloc.h" +#include "xfs_trace.h" kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ @@ -107,3 +115,147 @@ xfs_icreate_log( tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags); } + +static enum xlog_recover_reorder +xlog_recover_icreate_reorder( + struct xlog_recover_item *item) +{ + /* + * Inode allocation buffers must be replayed before subsequent inode + * items try to modify those buffers. ICREATE items are the logical + * equivalent of logging a newly initialized inode buffer, so recover + * these at the same time that we recover logged buffers. + */ + return XLOG_REORDER_BUFFER_LIST; +} + +/* + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. It's purpose is to initialise the inodes + * being allocated on disk. This requires us to get inode cluster buffers that + * match the range to be initialised, stamped with inode templates and written + * by delayed write so that subsequent modifications will hit the cached buffer + * and only need writing out at the end of recovery. + */ +STATIC int +xlog_recover_icreate_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_icreate_log *icl; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agnumber_t agno; + xfs_agblock_t agbno; + unsigned int count; + unsigned int isize; + xfs_agblock_t length; + int bb_per_cluster; + int cancel_count; + int nbufs; + int i; + + icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + if (icl->icl_type != XFS_LI_ICREATE) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); + return -EINVAL; + } + + if (icl->icl_size != 1) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); + return -EINVAL; + } + + agno = be32_to_cpu(icl->icl_ag); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); + return -EINVAL; + } + agbno = be32_to_cpu(icl->icl_agbno); + if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); + return -EINVAL; + } + isize = be32_to_cpu(icl->icl_isize); + if (isize != mp->m_sb.sb_inodesize) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); + return -EINVAL; + } + count = be32_to_cpu(icl->icl_count); + if (!count) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); + return -EINVAL; + } + length = be32_to_cpu(icl->icl_length); + if (!length || length >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); + return -EINVAL; + } + + /* + * The inode chunk is either full or sparse and we only support + * m_ino_geo.ialloc_min_blks sized sparse allocations at this time. + */ + if (length != igeo->ialloc_blks && + length != igeo->ialloc_min_blks) { + xfs_warn(log->l_mp, + "%s: unsupported chunk length", __FUNCTION__); + return -EINVAL; + } + + /* verify inode count is consistent with extent length */ + if ((count >> mp->m_sb.sb_inopblog) != length) { + xfs_warn(log->l_mp, + "%s: inconsistent inode count and chunk length", + __FUNCTION__); + return -EINVAL; + } + + /* + * The icreate transaction can cover multiple cluster buffers and these + * buffers could have been freed and reused. Check the individual + * buffers for cancellation so we don't overwrite anything written after + * a cancellation. + */ + bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); + nbufs = length / igeo->blocks_per_cluster; + for (i = 0, cancel_count = 0; i < nbufs; i++) { + xfs_daddr_t daddr; + + daddr = XFS_AGB_TO_DADDR(mp, agno, + agbno + i * igeo->blocks_per_cluster); + if (xlog_is_buffer_cancelled(log, daddr, bb_per_cluster)) + cancel_count++; + } + + /* + * We currently only use icreate for a single allocation at a time. This + * means we should expect either all or none of the buffers to be + * cancelled. Be conservative and skip replay if at least one buffer is + * cancelled, but warn the user that something is awry if the buffers + * are not consistent. + * + * XXX: This must be refined to only skip cancelled clusters once we use + * icreate for multiple chunk allocations. + */ + ASSERT(!cancel_count || cancel_count == nbufs); + if (cancel_count) { + if (cancel_count != nbufs) + xfs_warn(mp, + "WARNING: partial inode chunk cancellation, skipped icreate."); + trace_xfs_log_recover_icreate_cancel(log, icl); + return 0; + } + + trace_xfs_log_recover_icreate_recover(log, icl); + return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, + length, be32_to_cpu(icl->icl_gen)); +} + +const struct xlog_recover_item_ops xlog_icreate_item_ops = { + .item_type = XFS_LI_ICREATE, + .reorder = xlog_recover_icreate_reorder, + .commit_pass2 = xlog_recover_icreate_commit_pass2, +}; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d1772786af29..64f5f9a440ae 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -112,7 +112,7 @@ xfs_ilock_data_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE && (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); @@ -125,7 +125,8 @@ xfs_ilock_attr_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + if (ip->i_afp && + ip->i_afp->if_format == XFS_DINODE_FMT_BTREE && (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); @@ -825,7 +826,7 @@ xfs_ialloc( inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; - ip->i_d.di_nextents = 0; + ip->i_df.if_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); tv = current_time(inode); @@ -851,7 +852,7 @@ xfs_ialloc( case S_IFCHR: case S_IFBLK: case S_IFSOCK: - ip->i_d.di_format = XFS_DINODE_FMT_DEV; + ip->i_df.if_format = XFS_DINODE_FMT_DEV; ip->i_df.if_flags = 0; flags |= XFS_ILOG_DEV; break; @@ -907,7 +908,7 @@ xfs_ialloc( } /* FALLTHROUGH */ case S_IFLNK: - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_flags = XFS_IFEXTENTS; ip->i_df.if_bytes = 0; ip->i_df.if_u1.if_root = NULL; @@ -915,11 +916,6 @@ xfs_ialloc( default: ASSERT(0); } - /* - * Attribute fork settings for new inode. - */ - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_anextents = 0; /* * Log the new values stuffed into the inode. @@ -1686,7 +1682,7 @@ xfs_inactive_truncate( if (error) goto error_trans_cancel; - ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_df.if_nextents == 0); error = xfs_trans_commit(tp); if (error) @@ -1836,7 +1832,7 @@ xfs_inactive( if (S_ISREG(VFS_I(ip)->i_mode) && (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || - ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; error = xfs_qm_dqattach(ip); @@ -1862,7 +1858,6 @@ xfs_inactive( } ASSERT(!ip->i_afp); - ASSERT(ip->i_d.di_anextents == 0); ASSERT(ip->i_d.di_forkoff == 0); /* @@ -2172,7 +2167,7 @@ xfs_iunlink_update_inode( ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); if (error) return error; @@ -2302,7 +2297,7 @@ xfs_iunlink_map_ino( return error; } - error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0); + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0); if (error) { xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", __func__, error); @@ -2602,7 +2597,7 @@ xfs_ifree_cluster( xfs_daddr_t blkno; xfs_buf_t *bp; xfs_inode_t *ip; - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; struct xfs_log_item *lip; struct xfs_perag *pag; struct xfs_ino_geometry *igeo = M_IGEO(mp); @@ -2662,7 +2657,7 @@ xfs_ifree_cluster( */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_type == XFS_LI_INODE) { - iip = (xfs_inode_log_item_t *)lip; + iip = (struct xfs_inode_log_item *)lip; ASSERT(iip->ili_logged == 1); lip->li_cb = xfs_istale_done; xfs_trans_ail_copy_lsn(mp->m_ail, @@ -2712,24 +2707,6 @@ xfs_ifree_cluster( } /* - * Free any local-format buffers sitting around before we reset to - * extents format. - */ -static inline void -xfs_ifree_local_data( - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_ifork *ifp; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return; - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); -} - -/* * This is called to return an inode to the inode free list. * The inode should already be truncated to 0 length and have * no pages associated with it. This routine also assumes that @@ -2749,8 +2726,7 @@ xfs_ifree( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(ip->i_d.di_nextents == 0); - ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_df.if_nextents == 0); ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); ASSERT(ip->i_d.di_nblocks == 0); @@ -2765,16 +2741,23 @@ xfs_ifree( if (error) return error; - xfs_ifree_local_data(ip, XFS_DATA_FORK); - xfs_ifree_local_data(ip, XFS_ATTR_FORK); + /* + * Free any local-format data sitting around before we reset the + * data fork to extents format. Note that the attr fork data has + * already been freed by xfs_attr_inactive. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { + kmem_free(ip->i_df.if_u1.if_data); + ip->i_df.if_u1.if_data = NULL; + ip->i_df.if_bytes = 0; + } VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_flags2 = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ - ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; /* Don't attempt to replay owner changes for a deleted inode */ ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); @@ -3496,6 +3479,7 @@ xfs_iflush_cluster( struct xfs_inode **cilist; struct xfs_inode *cip; struct xfs_ino_geometry *igeo = M_IGEO(mp); + int error = 0; int nr_found; int clcount = 0; int i; @@ -3588,11 +3572,10 @@ xfs_iflush_cluster( * re-check that it's dirty before flushing. */ if (!xfs_inode_clean(cip)) { - int error; error = xfs_iflush_int(cip, bp); if (error) { xfs_iunlock(cip, XFS_ILOCK_SHARED); - goto cluster_corrupt_out; + goto out_free; } clcount++; } else { @@ -3611,37 +3594,7 @@ out_free: kmem_free(cilist); out_put: xfs_perag_put(pag); - return 0; - - -cluster_corrupt_out: - /* - * Corruption detected in the clustering loop. Invalidate the - * inode buffer and shut down the filesystem. - */ - rcu_read_unlock(); - - /* - * We'll always have an inode attached to the buffer for completion - * process by the time we are called from xfs_iflush(). Hence we have - * always need to do IO completion processing to abort the inodes - * attached to the buffer. handle them just like the shutdown case in - * xfs_buf_submit(). - */ - ASSERT(bp->b_iodone); - bp->b_flags |= XBF_ASYNC; - bp->b_flags &= ~XBF_DONE; - xfs_buf_stale(bp); - xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp); - - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - - /* abort the corrupt inode, as it was not attached to the buffer */ - xfs_iflush_abort(cip, false); - kmem_free(cilist); - xfs_perag_put(pag); - return -EFSCORRUPTED; + return error; } /* @@ -3667,8 +3620,8 @@ xfs_iflush( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || + ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); *bpp = NULL; @@ -3688,42 +3641,20 @@ xfs_iflush( } /* - * This may have been unpinned because the filesystem is shutting - * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk. - * - * We also have to remove the log item from the AIL in this case, - * as we wait for an empty AIL as part of the unmount process. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - goto abort_out; - } - - /* * Get the buffer containing the on-disk inode. We are doing a try-lock - * operation here, so we may get an EAGAIN error. In that case, we - * simply want to return with the inode still dirty. + * operation here, so we may get an EAGAIN error. In that case, return + * leaving the inode dirty. * * If we get any other error, we effectively have a corruption situation - * and we cannot flush the inode, so we treat it the same as failing - * xfs_iflush_int(). + * and we cannot flush the inode. Abort the flush and shut down. */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, - 0); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK); if (error == -EAGAIN) { xfs_ifunlock(ip); return error; } if (error) - goto corrupt_out; - - /* - * First flush out the inode that xfs_iflush was called with. - */ - error = xfs_iflush_int(ip, bp); - if (error) - goto corrupt_out; + goto abort; /* * If the buffer is pinned then push on the log now so we won't @@ -3733,61 +3664,32 @@ xfs_iflush( xfs_log_force(mp, 0); /* - * inode clustering: try to gather other inodes into this write + * Flush the provided inode then attempt to gather others from the + * cluster into the write. * - * Note: Any error during clustering will result in the filesystem - * being shut down and completion callbacks run on the cluster buffer. - * As we have already flushed and attached this inode to the buffer, - * it has already been aborted and released by xfs_iflush_cluster() and - * so we have no further error handling to do here. + * Note: Once we attempt to flush an inode, we must run buffer + * completion callbacks on any failure. If this fails, simulate an I/O + * failure on the buffer and shut down. */ - error = xfs_iflush_cluster(ip, bp); - if (error) - return error; + error = xfs_iflush_int(ip, bp); + if (!error) + error = xfs_iflush_cluster(ip, bp); + if (error) { + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioend_fail(bp); + goto shutdown; + } *bpp = bp; return 0; -corrupt_out: - if (bp) - xfs_buf_relse(bp); +abort: + xfs_iflush_abort(ip); +shutdown: xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -abort_out: - /* abort the corrupt inode, as it was not attached to the buffer */ - xfs_iflush_abort(ip, false); return error; } -/* - * If there are inline format data / attr forks attached to this inode, - * make sure they're not corrupt. - */ -bool -xfs_inode_verify_forks( - struct xfs_inode *ip) -{ - struct xfs_ifork *ifp; - xfs_failaddr_t fa; - - fa = xfs_ifork_verify_data(ip, &xfs_default_ifork_ops); - if (fa) { - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", - ifp->if_u1.if_data, ifp->if_bytes, fa); - return false; - } - - fa = xfs_ifork_verify_attr(ip, &xfs_default_ifork_ops); - if (fa) { - ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); - xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp ? ifp->if_u1.if_data : NULL, - ifp ? ifp->if_bytes : 0, fa); - return false; - } - return true; -} - STATIC int xfs_iflush_int( struct xfs_inode *ip, @@ -3796,61 +3698,68 @@ xfs_iflush_int( struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_dinode *dip; struct xfs_mount *mp = ip->i_mount; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || + ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip != NULL && iip->ili_fields != 0); - /* set *dip = inode's place in the buffer */ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); + /* + * We don't flush the inode if any of the following checks fail, but we + * do still update the log item and attach to the backing buffer as if + * the flush happened. This is a formality to facilitate predictable + * error handling as the caller will shutdown and fail the buffer. + */ + error = -EFSCORRUPTED; if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %Lu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); - goto corrupt_out; + goto flush_out; } if (S_ISREG(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE, mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { if (XFS_TEST_ERROR( - (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && - (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && - (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), + ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %Lu, ptr "PTR_FMT, __func__, ip->i_ino, ip); - goto corrupt_out; + goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > + if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %Lu, " "total extents = %d, nblocks = %Ld, ptr "PTR_FMT, __func__, ip->i_ino, - ip->i_d.di_nextents + ip->i_d.di_anextents, + ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), ip->i_d.di_nblocks, ip); - goto corrupt_out; + goto flush_out; } if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %Lu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_d.di_forkoff, ip); - goto corrupt_out; + goto flush_out; } /* @@ -3865,9 +3774,16 @@ xfs_iflush_int( if (!xfs_sb_version_has_v3inode(&mp->m_sb)) ip->i_d.di_flushiter++; - /* Check the inline fork data before we write out. */ - if (!xfs_inode_verify_forks(ip)) - goto corrupt_out; + /* + * If there are inline format data / attr forks attached to this inode, + * make sure they are not corrupt. + */ + if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_data(ip)) + goto flush_out; + if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL && + xfs_ifork_verify_local_attr(ip)) + goto flush_out; /* * Copy the dirty parts of the inode into the on-disk inode. We always @@ -3910,6 +3826,8 @@ xfs_iflush_int( * need the AIL lock, because it is a 64 bit value that cannot be read * atomically. */ + error = 0; +flush_out: iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; @@ -3919,10 +3837,10 @@ xfs_iflush_int( &iip->ili_item.li_lsn); /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. + * Attach the inode item callback to the buffer whether the flush + * succeeded or not. If not, the caller will shut down and fail I/O + * completion on the buffer to remove the inode from the AIL and release + * the flush lock. */ xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); @@ -3931,10 +3849,7 @@ xfs_iflush_int( ASSERT(!list_empty(&bp->b_li_list)); ASSERT(bp->b_iodone != NULL); - return 0; - -corrupt_out: - return -EFSCORRUPTED; + return error; } /* Release an inode. */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c6a63f6764a6..47d3b391030d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -57,9 +57,6 @@ typedef struct xfs_inode { struct xfs_icdinode i_d; /* most of ondisk inode */ - xfs_extnum_t i_cnextents; /* # of extents in cow fork */ - unsigned int i_cformat; /* format of cow fork */ - /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ @@ -218,8 +215,7 @@ static inline bool xfs_inode_has_cow_data(struct xfs_inode *ip) #define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) -#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ -#define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */ +#define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ /* * If this unlinked inode is in the middle of recovery, don't let drop_inode * truncate and free the inode. This can happen if we iget the inode during @@ -467,6 +463,7 @@ int xfs_break_layouts(struct inode *inode, uint *iolock, /* from xfs_iops.c */ extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); +extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); /* * When setting up a newly allocated inode, we need to call @@ -497,8 +494,6 @@ extern struct kmem_zone *xfs_inode_zone; /* The default CoW extent size hint. */ #define XFS_DEFAULT_COWEXTSZ_HINT 32 -bool xfs_inode_verify_forks(struct xfs_inode *ip); - int xfs_iunlink_init(struct xfs_perag *pag); void xfs_iunlink_destroy(struct xfs_perag *pag); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f779cca2346f..ba47bf65b772 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -36,10 +36,10 @@ xfs_inode_item_data_fork_size( { struct xfs_inode *ip = iip->ili_inode; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_DEXT) && - ip->i_d.di_nextents > 0 && + ip->i_df.if_nextents > 0 && ip->i_df.if_bytes > 0) { /* worst case, doesn't subtract delalloc extents */ *nbytes += XFS_IFORK_DSIZE(ip); @@ -77,10 +77,10 @@ xfs_inode_item_attr_fork_size( { struct xfs_inode *ip = iip->ili_inode; - switch (ip->i_d.di_aformat) { + switch (ip->i_afp->if_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_d.di_anextents > 0 && + ip->i_afp->if_nextents > 0 && ip->i_afp->if_bytes > 0) { /* worst case, doesn't subtract unused space */ *nbytes += XFS_IFORK_ASIZE(ip); @@ -142,13 +142,13 @@ xfs_inode_item_format_data_fork( struct xfs_inode *ip = iip->ili_inode; size_t data_bytes; - switch (ip->i_d.di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: iip->ili_fields &= ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DEXT) && - ip->i_d.di_nextents > 0 && + ip->i_df.if_nextents > 0 && ip->i_df.if_bytes > 0) { struct xfs_bmbt_rec *p; @@ -227,18 +227,18 @@ xfs_inode_item_format_attr_fork( struct xfs_inode *ip = iip->ili_inode; size_t data_bytes; - switch (ip->i_d.di_aformat) { + switch (ip->i_afp->if_format) { case XFS_DINODE_FMT_EXTENTS: iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_d.di_anextents > 0 && + ip->i_afp->if_nextents > 0 && ip->i_afp->if_bytes > 0) { struct xfs_bmbt_rec *p; ASSERT(xfs_iext_count(ip->i_afp) == - ip->i_d.di_anextents); + ip->i_afp->if_nextents); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); @@ -305,7 +305,7 @@ xfs_inode_to_log_dinode( struct inode *inode = VFS_I(ip); to->di_magic = XFS_DINODE_MAGIC; - to->di_format = from->di_format; + to->di_format = xfs_ifork_format(&ip->i_df); to->di_uid = i_uid_read(inode); to->di_gid = i_gid_read(inode); to->di_projid_lo = from->di_projid & 0xffff; @@ -326,10 +326,10 @@ xfs_inode_to_log_dinode( to->di_size = from->di_size; to->di_nblocks = from->di_nblocks; to->di_extsize = from->di_extsize; - to->di_nextents = from->di_nextents; - to->di_anextents = from->di_anextents; + to->di_nextents = xfs_ifork_nextents(&ip->i_df); + to->di_anextents = xfs_ifork_nextents(ip->i_afp); to->di_forkoff = from->di_forkoff; - to->di_aformat = from->di_aformat; + to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_dmevmask = from->di_dmevmask; to->di_dmstate = from->di_dmstate; to->di_flags = from->di_flags; @@ -497,21 +497,6 @@ xfs_inode_item_push( if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; - /* - * The buffer containing this item failed to be written back - * previously. Resubmit the buffer for IO. - */ - if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { - if (!xfs_buf_trylock(bp)) - return XFS_ITEM_LOCKED; - - if (!xfs_buf_resubmit_failed_buffers(bp, buffer_list)) - rval = XFS_ITEM_FLUSHING; - - xfs_buf_unlock(bp); - return rval; - } - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; @@ -777,17 +762,12 @@ xfs_iflush_done( */ void xfs_iflush_abort( - xfs_inode_t *ip, - bool stale) + struct xfs_inode *ip) { - xfs_inode_log_item_t *iip = ip->i_itemp; + struct xfs_inode_log_item *iip = ip->i_itemp; if (iip) { - if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) { - xfs_trans_ail_remove(&iip->ili_item, - stale ? SHUTDOWN_LOG_IO_ERROR : - SHUTDOWN_CORRUPT_INCORE); - } + xfs_trans_ail_delete(&iip->ili_item, 0); iip->ili_logged = 0; /* * Clear the ili_last_fields bits now that we know that the @@ -812,7 +792,7 @@ xfs_istale_done( struct xfs_buf *bp, struct xfs_log_item *lip) { - xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true); + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode); } /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 07a60e74c39c..60b34bb66e8e 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -13,7 +13,7 @@ struct xfs_bmbt_rec; struct xfs_inode; struct xfs_mount; -typedef struct xfs_inode_log_item { +struct xfs_inode_log_item { struct xfs_log_item ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ @@ -23,7 +23,7 @@ typedef struct xfs_inode_log_item { unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ unsigned int ili_fsync_fields; /* logged since last fsync */ -} xfs_inode_log_item_t; +}; static inline int xfs_inode_clean(xfs_inode_t *ip) { @@ -34,7 +34,7 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); -extern void xfs_iflush_abort(struct xfs_inode *, bool); +extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c new file mode 100644 index 000000000000..dc3e26ff16c9 --- /dev/null +++ b/fs/xfs/xfs_inode_item_recover.c @@ -0,0 +1,394 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_log.h" +#include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" +#include "xfs_icache.h" +#include "xfs_bmap_btree.h" + +STATIC void +xlog_recover_inode_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr; + + xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, + &xfs_inode_buf_ra_ops); + } else { + struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr; + + xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, + &xfs_inode_buf_ra_ops); + } +} + +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. + * + * Also, because we don't have a transaction context available here but need to + * gather all the buffers we modify for writeback so we pass the buffer_list + * instead for the operation to use. + */ + +STATIC int +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return -ENOMEM; + + /* instantiate the inode */ + ASSERT(dip->di_version >= 3); + + error = xfs_inode_from_disk(ip, dip); + if (error) + goto out_free_ip; + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; +} + +STATIC int +xlog_recover_inode_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) +{ + struct xfs_inode_log_format *in_f; + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp; + struct xfs_dinode *dip; + int len; + char *src; + char *dest; + int error; + int attr_index; + uint fields; + struct xfs_log_dinode *ldip; + uint isize; + int need_free = 0; + + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + in_f = item->ri_buf[0].i_addr; + } else { + in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); + need_free = 1; + error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); + if (error) + goto error; + } + + /* + * Inode buffers can be freed, look out for it, + * and do not replay the inode. + */ + if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) { + error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); + goto error; + } + trace_xfs_log_recover_inode_recover(log, in_f); + + error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, + 0, &bp, &xfs_inode_buf_ops); + if (error) + goto error; + ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); + dip = xfs_buf_offset(bp, in_f->ilf_boffset); + + /* + * Make sure the place we're flushing out to really looks + * like an inode! + */ + if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { + xfs_alert(mp, + "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", + __func__, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + ldip = item->ri_buf[1].i_addr; + if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { + xfs_alert(mp, + "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", + __func__, item, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + + /* + * If the inode has an LSN in it, recover the inode only if it's less + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. + */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_owner_change; + } + } + + /* + * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately by the LSN field in the V3 inode core. Don't trust + * the inode versions we might be changing them here - use the + * superblock flag to determine whether we need to look at di_flushiter + * to skip replay when the on disk inode is newer than the log one + */ + if (!xfs_sb_version_has_v3inode(&mp->m_sb) && + ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * Deal with the wrap case, DI_MAX_FLUSH is less + * than smaller numbers + */ + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { + /* do nothing */ + } else { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_release; + } + } + + /* Take the opportunity to reset the flush iteration count */ + ldip->di_flushiter = 0; + + if (unlikely(S_ISREG(ldip->di_mode))) { + if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && + (ldip->di_format != XFS_DINODE_FMT_BTREE)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad regular inode log record, rec ptr "PTR_FMT", " + "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + } else if (unlikely(S_ISDIR(ldip->di_mode))) { + if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && + (ldip->di_format != XFS_DINODE_FMT_BTREE) && + (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad dir inode log record, rec ptr "PTR_FMT", " + "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } + } + if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " + "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", + __func__, item, dip, bp, in_f->ilf_ino, + ldip->di_nextents + ldip->di_anextents, + ldip->di_nblocks); + error = -EFSCORRUPTED; + goto out_release; + } + if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " + "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, + item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); + error = -EFSCORRUPTED; + goto out_release; + } + isize = xfs_log_dinode_size(mp); + if (unlikely(item->ri_buf[1].i_len > isize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", + XFS_ERRLEVEL_LOW, mp, ldip, + sizeof(*ldip)); + xfs_alert(mp, + "%s: Bad inode log record length %d, rec ptr "PTR_FMT, + __func__, item->ri_buf[1].i_len, item); + error = -EFSCORRUPTED; + goto out_release; + } + + /* recover the log dinode inode into the on disk inode */ + xfs_log_dinode_to_disk(ldip, dip); + + fields = in_f->ilf_fields; + if (fields & XFS_ILOG_DEV) + xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); + + if (in_f->ilf_size == 2) + goto out_owner_change; + len = item->ri_buf[2].i_len; + src = item->ri_buf[2].i_addr; + ASSERT(in_f->ilf_size <= 4); + ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); + ASSERT(!(fields & XFS_ILOG_DFORK) || + (len == in_f->ilf_dsize)); + + switch (fields & XFS_ILOG_DFORK) { + case XFS_ILOG_DDATA: + case XFS_ILOG_DEXT: + memcpy(XFS_DFORK_DPTR(dip), src, len); + break; + + case XFS_ILOG_DBROOT: + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, + (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip), + XFS_DFORK_DSIZE(dip, mp)); + break; + + default: + /* + * There are no data fork flags set. + */ + ASSERT((fields & XFS_ILOG_DFORK) == 0); + break; + } + + /* + * If we logged any attribute data, recover it. There may or + * may not have been any other non-core data logged in this + * transaction. + */ + if (in_f->ilf_fields & XFS_ILOG_AFORK) { + if (in_f->ilf_fields & XFS_ILOG_DFORK) { + attr_index = 3; + } else { + attr_index = 2; + } + len = item->ri_buf[attr_index].i_len; + src = item->ri_buf[attr_index].i_addr; + ASSERT(len == in_f->ilf_asize); + + switch (in_f->ilf_fields & XFS_ILOG_AFORK) { + case XFS_ILOG_ADATA: + case XFS_ILOG_AEXT: + dest = XFS_DFORK_APTR(dip); + ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); + memcpy(dest, src, len); + break; + + case XFS_ILOG_ABROOT: + dest = XFS_DFORK_APTR(dip); + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, + len, (struct xfs_bmdr_block *)dest, + XFS_DFORK_ASIZE(dip, mp)); + break; + + default: + xfs_warn(log->l_mp, "%s: Invalid flag", __func__); + ASSERT(0); + error = -EFSCORRUPTED; + goto out_release; + } + } + +out_owner_change: + /* Recover the swapext owner change unless inode has been deleted */ + if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && + (dip->di_mode != 0)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + + ASSERT(bp->b_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); +error: + if (need_free) + kmem_free(in_f); + return error; +} + +const struct xlog_recover_item_ops xlog_inode_item_ops = { + .item_type = XFS_LI_INODE, + .ra_pass2 = xlog_recover_inode_ra_pass2, + .commit_pass2 = xlog_recover_inode_commit_pass2, +}; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 309958186d33..a40f88cf3ab7 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1104,26 +1104,17 @@ xfs_fill_fsxattr( bool attr, struct fsxattr *fa) { + struct xfs_ifork *ifp = attr ? ip->i_afp : &ip->i_df; + simple_fill_fsxattr(fa, xfs_ip2xflags(ip)); fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; fa->fsx_cowextsize = ip->i_d.di_cowextsize << ip->i_mount->m_sb.sb_blocklog; fa->fsx_projid = ip->i_d.di_projid; - - if (attr) { - if (ip->i_afp) { - if (ip->i_afp->if_flags & XFS_IFEXTENTS) - fa->fsx_nextents = xfs_iext_count(ip->i_afp); - else - fa->fsx_nextents = ip->i_d.di_anextents; - } else - fa->fsx_nextents = 0; - } else { - if (ip->i_df.if_flags & XFS_IFEXTENTS) - fa->fsx_nextents = xfs_iext_count(&ip->i_df); - else - fa->fsx_nextents = ip->i_d.di_nextents; - } + if (ifp && (ifp->if_flags & XFS_IFEXTENTS)) + fa->fsx_nextents = xfs_iext_count(ifp); + else + fa->fsx_nextents = xfs_ifork_nextents(ifp); } STATIC int @@ -1201,37 +1192,6 @@ xfs_flags2diflags2( return di_flags2; } -STATIC void -xfs_diflags_to_linux( - struct xfs_inode *ip) -{ - struct inode *inode = VFS_I(ip); - unsigned int xflags = xfs_ip2xflags(ip); - - if (xflags & FS_XFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (xflags & FS_XFLAG_APPEND) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (xflags & FS_XFLAG_SYNC) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (xflags & FS_XFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; -#if 0 /* disabled until the flag switching races are sorted out */ - if (xflags & FS_XFLAG_DAX) - inode->i_flags |= S_DAX; - else - inode->i_flags &= ~S_DAX; -#endif -} - static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, @@ -1242,7 +1202,7 @@ xfs_ioctl_setattr_xflags( uint64_t di_flags2; /* Can't change realtime flag if any extents are allocated. */ - if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + if ((ip->i_df.if_nextents || ip->i_delayed_blks) && XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) return -EINVAL; @@ -1269,7 +1229,7 @@ xfs_ioctl_setattr_xflags( ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); ip->i_d.di_flags2 = di_flags2; - xfs_diflags_to_linux(ip); + xfs_diflags_to_iflags(ip, false); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); @@ -1420,7 +1380,7 @@ xfs_ioctl_setattr_check_extsize( xfs_extlen_t size; xfs_fsblock_t extsize_fsb; - if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents && + if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) return -EINVAL; @@ -1513,7 +1473,6 @@ xfs_ioctl_setattr( struct fsxattr old_fa; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - struct xfs_dquot *udqp = NULL; struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; int code; @@ -1536,7 +1495,7 @@ xfs_ioctl_setattr( if (XFS_IS_QUOTA_ON(mp)) { code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, VFS_I(ip)->i_gid, fa->fsx_projid, - XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); + XFS_QMOPT_PQUOTA, NULL, NULL, &pdqp); if (code) return code; } @@ -1560,7 +1519,7 @@ xfs_ioctl_setattr( if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && ip->i_d.di_projid != fa->fsx_projid) { - code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, + code = xfs_qm_vop_chown_reserve(tp, ip, NULL, NULL, pdqp, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (code) /* out of quota */ goto error_trans_cancel; @@ -1626,7 +1585,6 @@ xfs_ioctl_setattr( * Release any dquot(s) the inode had kept before chown. */ xfs_qm_dqrele(olddquot); - xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); return code; @@ -1634,7 +1592,6 @@ xfs_ioctl_setattr( error_trans_cancel: xfs_trans_cancel(tp); error_free_dquots: - xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); return code; } @@ -2082,6 +2039,41 @@ out: return error; } +static inline int +xfs_fs_eofblocks_from_user( + struct xfs_fs_eofblocks *src, + struct xfs_eofblocks *dst) +{ + if (src->eof_version != XFS_EOFBLOCKS_VERSION) + return -EINVAL; + + if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) + return -EINVAL; + + if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || + memchr_inv(src->pad64, 0, sizeof(src->pad64))) + return -EINVAL; + + dst->eof_flags = src->eof_flags; + dst->eof_prid = src->eof_prid; + dst->eof_min_file_size = src->eof_min_file_size; + + dst->eof_uid = INVALID_UID; + if (src->eof_flags & XFS_EOF_FLAGS_UID) { + dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); + if (!uid_valid(dst->eof_uid)) + return -EINVAL; + } + + dst->eof_gid = INVALID_GID; + if (src->eof_flags & XFS_EOF_FLAGS_GID) { + dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); + if (!gid_valid(dst->eof_gid)) + return -EINVAL; + } + return 0; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index bb590a267a7f..b9a8c3798e08 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -352,22 +352,10 @@ xfs_quota_calc_throttle( } /* - * If we are doing a write at the end of the file and there are no allocations - * past this one, then extend the allocation out to the file system's write - * iosize. - * * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. Cap the maximum size * at a single extent or less if the filesystem is near full. The closer the - * filesystem is to full, the smaller the maximum prealocation. - * - * As an exception we don't do any preallocation at all if the file is smaller - * than the minimum preallocation and we are using the default dynamic - * preallocation scheme, as it is likely this is the only write to the file that - * is going to be done. - * - * We clean up any extra space left over when the file is closed in - * xfs_inactive(). + * filesystem is to being full, the smaller the maximum preallocation. */ STATIC xfs_fsblock_t xfs_iomap_prealloc_size( @@ -377,63 +365,70 @@ xfs_iomap_prealloc_size( loff_t count, struct xfs_iext_cursor *icur) { + struct xfs_iext_cursor ncur = *icur; + struct xfs_bmbt_irec prev, got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); - struct xfs_bmbt_irec prev; - int shift = 0; int64_t freesp; xfs_fsblock_t qblocks; - int qshift = 0; xfs_fsblock_t alloc_blocks = 0; + xfs_extlen_t plen; + int shift = 0; + int qshift = 0; - if (offset + count <= XFS_ISIZE(ip)) - return 0; - - if (!(mp->m_flags & XFS_MOUNT_ALLOCSIZE) && - (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks))) + /* + * As an exception we don't do any preallocation at all if the file is + * smaller than the minimum preallocation and we are using the default + * dynamic preallocation scheme, as it is likely this is the only write + * to the file that is going to be done. + */ + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks)) return 0; /* - * If an explicit allocsize is set, the file is small, or we - * are writing behind a hole, then use the minimum prealloc: + * Use the minimum preallocation size for small files or if we are + * writing right after a hole. */ - if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) || - XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || - !xfs_iext_peek_prev_extent(ifp, icur, &prev) || + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || + !xfs_iext_prev_extent(ifp, &ncur, &prev) || prev.br_startoff + prev.br_blockcount < offset_fsb) return mp->m_allocsize_blocks; /* - * Determine the initial size of the preallocation. We are beyond the - * current EOF here, but we need to take into account whether this is - * a sparse write or an extending write when determining the - * preallocation size. Hence we need to look up the extent that ends - * at the current write offset and use the result to determine the - * preallocation size. - * - * If the extent is a hole, then preallocation is essentially disabled. - * Otherwise we take the size of the preceding data extent as the basis - * for the preallocation size. If the size of the extent is greater than - * half the maximum extent length, then use the current offset as the - * basis. This ensures that for large files the preallocation size - * always extends to MAXEXTLEN rather than falling short due to things - * like stripe unit/width alignment of real extents. + * Take the size of the preceding data extents as the basis for the + * preallocation size. Note that we don't care if the previous extents + * are written or not. */ - if (prev.br_blockcount <= (MAXEXTLEN >> 1)) - alloc_blocks = prev.br_blockcount << 1; - else + plen = prev.br_blockcount; + while (xfs_iext_prev_extent(ifp, &ncur, &got)) { + if (plen > MAXEXTLEN / 2 || + isnullstartblock(got.br_startblock) || + got.br_startoff + got.br_blockcount != prev.br_startoff || + got.br_startblock + got.br_blockcount != prev.br_startblock) + break; + plen += got.br_blockcount; + prev = got; + } + + /* + * If the size of the extents is greater than half the maximum extent + * length, then use the current offset as the basis. This ensures that + * for large files the preallocation size always extends to MAXEXTLEN + * rather than falling short due to things like stripe unit/width + * alignment of real extents. + */ + alloc_blocks = plen * 2; + if (alloc_blocks > MAXEXTLEN) alloc_blocks = XFS_B_TO_FSB(mp, offset); - if (!alloc_blocks) - goto check_writeio; qblocks = alloc_blocks; /* * MAXEXTLEN is not a power of two value but we round the prealloc down * to the nearest power of two value after throttling. To prevent the - * round down from unconditionally reducing the maximum supported prealloc - * size, we round up first, apply appropriate throttling, round down and - * cap the value to MAXEXTLEN. + * round down from unconditionally reducing the maximum supported + * prealloc size, we round up first, apply appropriate throttling, + * round down and cap the value to MAXEXTLEN. */ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), alloc_blocks); @@ -494,7 +489,6 @@ xfs_iomap_prealloc_size( */ while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; -check_writeio: if (alloc_blocks < mp->m_allocsize_blocks) alloc_blocks = mp->m_allocsize_blocks; trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, @@ -563,7 +557,7 @@ xfs_iomap_write_unwritten( xfs_trans_ijoin(tp, ip, 0); error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, - XFS_QMOPT_RES_REGBLKS); + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES); if (error) goto error_on_bmapi_transaction; @@ -856,7 +850,7 @@ xfs_buffered_write_iomap_begin( xfs_ilock(ip, XFS_ILOCK_EXCL); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, XFS_DATA_FORK)) || + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { error = -EFSCORRUPTED; goto out_unlock; @@ -961,9 +955,16 @@ xfs_buffered_write_iomap_begin( if (error) goto out_unlock; - if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, offset, - count, &icur); + if (eof && offset + count > XFS_ISIZE(ip)) { + /* + * Determine the initial size of the preallocation. + * We clean up any extra preallocation when the file is closed. + */ + if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) + prealloc_blocks = mp->m_allocsize_blocks; + else + prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, + offset, count, &icur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; @@ -1258,12 +1259,12 @@ xfs_xattr_iomap_begin( lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { + if (!XFS_IFORK_Q(ip) || !ip->i_afp->if_nextents) { error = -ENOENT; goto out_unlock; } - ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); + ASSERT(ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f7a99b3bbcf7..d66528fa3657 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -738,12 +738,7 @@ xfs_setattr_nonsize( if (error) /* out of quota */ goto out_cancel; } - } - /* - * Change file ownership. Must be the owner or privileged. - */ - if (mask & (ATTR_UID|ATTR_GID)) { /* * CAP_FSETID overrides the following restrictions: * @@ -877,7 +872,7 @@ xfs_setattr_size( /* * Short circuit the truncate case for zero length files. */ - if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { + if (newsize == 0 && oldsize == 0 && ip->i_df.if_nextents == 0) { if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) return 0; @@ -1243,13 +1238,12 @@ xfs_inode_supports_dax( { struct xfs_mount *mp = ip->i_mount; - /* Only supported on non-reflinked files. */ - if (!S_ISREG(VFS_I(ip)->i_mode) || xfs_is_reflink_inode(ip)) + /* Only supported on regular files. */ + if (!S_ISREG(VFS_I(ip)->i_mode)) return false; - /* DAX mount option or DAX iflag must be set. */ - if (!(mp->m_flags & XFS_MOUNT_DAX) && - !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) + /* Only supported on non-reflinked files. */ + if (xfs_is_reflink_inode(ip)) return false; /* Block size must match page size */ @@ -1260,26 +1254,51 @@ xfs_inode_supports_dax( return xfs_inode_buftarg(ip)->bt_daxdev != NULL; } -STATIC void +static bool +xfs_inode_should_enable_dax( + struct xfs_inode *ip) +{ + if (!IS_ENABLED(CONFIG_FS_DAX)) + return false; + if (ip->i_mount->m_flags & XFS_MOUNT_DAX_NEVER) + return false; + if (!xfs_inode_supports_dax(ip)) + return false; + if (ip->i_mount->m_flags & XFS_MOUNT_DAX_ALWAYS) + return true; + if (ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + return true; + return false; +} + +void xfs_diflags_to_iflags( - struct inode *inode, - struct xfs_inode *ip) + struct xfs_inode *ip, + bool init) { - uint16_t flags = ip->i_d.di_flags; - - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | - S_NOATIME | S_DAX); - - if (flags & XFS_DIFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - if (flags & XFS_DIFLAG_APPEND) - inode->i_flags |= S_APPEND; - if (flags & XFS_DIFLAG_SYNC) - inode->i_flags |= S_SYNC; - if (flags & XFS_DIFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - if (xfs_inode_supports_dax(ip)) - inode->i_flags |= S_DAX; + struct inode *inode = VFS_I(ip); + unsigned int xflags = xfs_ip2xflags(ip); + unsigned int flags = 0; + + ASSERT(!(IS_DAX(inode) && init)); + + if (xflags & FS_XFLAG_IMMUTABLE) + flags |= S_IMMUTABLE; + if (xflags & FS_XFLAG_APPEND) + flags |= S_APPEND; + if (xflags & FS_XFLAG_SYNC) + flags |= S_SYNC; + if (xflags & FS_XFLAG_NOATIME) + flags |= S_NOATIME; + if (init && xfs_inode_should_enable_dax(ip)) + flags |= S_DAX; + + /* + * S_DAX can only be set during inode initialization and is never set by + * the VFS, so we cannot mask off S_DAX in i_flags. + */ + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | S_NOATIME); + inode->i_flags |= flags; } /* @@ -1305,7 +1324,7 @@ xfs_setup_inode( inode_fake_hash(inode); i_size_write(inode, ip->i_d.di_size); - xfs_diflags_to_iflags(inode, ip); + xfs_diflags_to_iflags(ip, true); if (S_ISDIR(inode->i_mode)) { /* diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ff2da28fed90..16ca97a7ff00 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -104,9 +104,9 @@ xfs_bulkstat_one_int( buf->bs_xflags = xfs_ip2xflags(ip); buf->bs_extsize_blks = dic->di_extsize; - buf->bs_extents = dic->di_nextents; + buf->bs_extents = xfs_ifork_nextents(&ip->i_df); xfs_bulkstat_health(ip, buf); - buf->bs_aextents = dic->di_anextents; + buf->bs_aextents = xfs_ifork_nextents(ip->i_afp); buf->bs_forkoff = XFS_IFORK_BOFF(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; @@ -115,7 +115,7 @@ xfs_bulkstat_one_int( buf->bs_cowextsize_blks = dic->di_cowextsize; } - switch (dic->di_format) { + switch (ip->i_df.if_format) { case XFS_DINODE_FMT_DEV: buf->bs_rdev = sysv_encode_dev(inode->i_rdev); buf->bs_blksize = BLKDEV_IOSIZE; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 11c3502b07b1..ec015df55b77 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -18,21 +18,13 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" -#include "xfs_inode_item.h" -#include "xfs_extfree_item.h" #include "xfs_trans_priv.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" -#include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_bmap_btree.h" #include "xfs_error.h" -#include "xfs_dir2.h" -#include "xfs_rmap_item.h" #include "xfs_buf_item.h" -#include "xfs_refcount_item.h" -#include "xfs_bmap_item.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -56,17 +48,6 @@ xlog_do_recovery_pass( struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); /* - * This structure is used during recovery to record the buf log items which - * have been canceled and should not be replayed. - */ -struct xfs_buf_cancel { - xfs_daddr_t bc_blkno; - uint bc_len; - int bc_refcount; - struct list_head bc_list; -}; - -/* * Sector aligned buffer routines for buffer create/read/write/access */ @@ -284,7 +265,7 @@ xlog_header_check_mount( return 0; } -STATIC void +void xlog_recover_iodone( struct xfs_buf *bp) { @@ -1779,12 +1760,72 @@ xlog_clear_stale_blocks( return 0; } +/* + * Release the recovered intent item in the AIL that matches the given intent + * type and intent id. + */ +void +xlog_recover_release_intent( + struct xlog *log, + unsigned short intent_type, + uint64_t intent_id) +{ + struct xfs_ail_cursor cur; + struct xfs_log_item *lip; + struct xfs_ail *ailp = log->l_ailp; + + spin_lock(&ailp->ail_lock); + for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; + lip = xfs_trans_ail_cursor_next(ailp, &cur)) { + if (lip->li_type != intent_type) + continue; + if (!lip->li_ops->iop_match(lip, intent_id)) + continue; + + spin_unlock(&ailp->ail_lock); + lip->li_ops->iop_release(lip); + spin_lock(&ailp->ail_lock); + break; + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->ail_lock); +} + /****************************************************************************** * * Log recover routines * ****************************************************************************** */ +static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = { + &xlog_buf_item_ops, + &xlog_inode_item_ops, + &xlog_dquot_item_ops, + &xlog_quotaoff_item_ops, + &xlog_icreate_item_ops, + &xlog_efi_item_ops, + &xlog_efd_item_ops, + &xlog_rui_item_ops, + &xlog_rud_item_ops, + &xlog_cui_item_ops, + &xlog_cud_item_ops, + &xlog_bui_item_ops, + &xlog_bud_item_ops, +}; + +static const struct xlog_recover_item_ops * +xlog_find_item_ops( + struct xlog_recover_item *item) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++) + if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type) + return xlog_recover_item_ops[i]; + + return NULL; +} /* * Sort the log items in the transaction. @@ -1841,54 +1882,23 @@ xlog_recover_reorder_trans( struct xlog_recover *trans, int pass) { - xlog_recover_item_t *item, *n; + struct xlog_recover_item *item, *n; int error = 0; LIST_HEAD(sort_list); LIST_HEAD(cancel_list); LIST_HEAD(buffer_list); LIST_HEAD(inode_buffer_list); - LIST_HEAD(inode_list); + LIST_HEAD(item_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { - xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST; - switch (ITEM_TYPE(item)) { - case XFS_LI_ICREATE: - list_move_tail(&item->ri_list, &buffer_list); - break; - case XFS_LI_BUF: - if (buf_f->blf_flags & XFS_BLF_CANCEL) { - trace_xfs_log_recover_item_reorder_head(log, - trans, item, pass); - list_move(&item->ri_list, &cancel_list); - break; - } - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { - list_move(&item->ri_list, &inode_buffer_list); - break; - } - list_move_tail(&item->ri_list, &buffer_list); - break; - case XFS_LI_INODE: - case XFS_LI_DQUOT: - case XFS_LI_QUOTAOFF: - case XFS_LI_EFD: - case XFS_LI_EFI: - case XFS_LI_RUI: - case XFS_LI_RUD: - case XFS_LI_CUI: - case XFS_LI_CUD: - case XFS_LI_BUI: - case XFS_LI_BUD: - trace_xfs_log_recover_item_reorder_tail(log, - trans, item, pass); - list_move_tail(&item->ri_list, &inode_list); - break; - default: + item->ri_ops = xlog_find_item_ops(item); + if (!item->ri_ops) { xfs_warn(log->l_mp, - "%s: unrecognized type of log operation", - __func__); + "%s: unrecognized type of log operation (%d)", + __func__, ITEM_TYPE(item)); ASSERT(0); /* * return the remaining items back to the transaction @@ -1896,16 +1906,38 @@ xlog_recover_reorder_trans( */ if (!list_empty(&sort_list)) list_splice_init(&sort_list, &trans->r_itemq); - error = -EIO; - goto out; + error = -EFSCORRUPTED; + break; + } + + if (item->ri_ops->reorder) + fate = item->ri_ops->reorder(item); + + switch (fate) { + case XLOG_REORDER_BUFFER_LIST: + list_move_tail(&item->ri_list, &buffer_list); + break; + case XLOG_REORDER_CANCEL_LIST: + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); + list_move(&item->ri_list, &cancel_list); + break; + case XLOG_REORDER_INODE_BUFFER_LIST: + list_move(&item->ri_list, &inode_buffer_list); + break; + case XLOG_REORDER_ITEM_LIST: + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); + list_move_tail(&item->ri_list, &item_list); + break; } } -out: + ASSERT(list_empty(&sort_list)); if (!list_empty(&buffer_list)) list_splice(&buffer_list, &trans->r_itemq); - if (!list_empty(&inode_list)) - list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&item_list)) + list_splice_tail(&item_list, &trans->r_itemq); if (!list_empty(&inode_buffer_list)) list_splice_tail(&inode_buffer_list, &trans->r_itemq); if (!list_empty(&cancel_list)) @@ -1913,2152 +1945,15 @@ out: return error; } -/* - * Build up the table of buf cancel records so that we don't replay - * cancelled data in the second pass. For buffer records that are - * not cancel records, there is nothing to do here so we just return. - * - * If we get a cancel record which is already in the table, this indicates - * that the buffer was cancelled multiple times. In order to ensure - * that during pass 2 we keep the record in the table until we reach its - * last occurrence in the log, we keep a reference count in the cancel - * record in the table to tell us how many times we expect to see this - * record during the second pass. - */ -STATIC int -xlog_recover_buffer_pass1( - struct xlog *log, - struct xlog_recover_item *item) -{ - xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - struct list_head *bucket; - struct xfs_buf_cancel *bcp; - - if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { - xfs_err(log->l_mp, "bad buffer log item size (%d)", - item->ri_buf[0].i_len); - return -EFSCORRUPTED; - } - - /* - * If this isn't a cancel buffer item, then just return. - */ - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { - trace_xfs_log_recover_buf_not_cancel(log, buf_f); - return 0; - } - - /* - * Insert an xfs_buf_cancel record into the hash table of them. - * If there is already an identical record, bump its reference count. - */ - bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); - list_for_each_entry(bcp, bucket, bc_list) { - if (bcp->bc_blkno == buf_f->blf_blkno && - bcp->bc_len == buf_f->blf_len) { - bcp->bc_refcount++; - trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); - return 0; - } - } - - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); - bcp->bc_blkno = buf_f->blf_blkno; - bcp->bc_len = buf_f->blf_len; - bcp->bc_refcount = 1; - list_add_tail(&bcp->bc_list, bucket); - - trace_xfs_log_recover_buf_cancel_add(log, buf_f); - return 0; -} - -/* - * Check to see whether the buffer being recovered has a corresponding - * entry in the buffer cancel record table. If it is, return the cancel - * buffer structure to the caller. - */ -STATIC struct xfs_buf_cancel * -xlog_peek_buffer_cancelled( - struct xlog *log, - xfs_daddr_t blkno, - uint len, - unsigned short flags) -{ - struct list_head *bucket; - struct xfs_buf_cancel *bcp; - - if (!log->l_buf_cancel_table) { - /* empty table means no cancelled buffers in the log */ - ASSERT(!(flags & XFS_BLF_CANCEL)); - return NULL; - } - - bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); - list_for_each_entry(bcp, bucket, bc_list) { - if (bcp->bc_blkno == blkno && bcp->bc_len == len) - return bcp; - } - - /* - * We didn't find a corresponding entry in the table, so return 0 so - * that the buffer is NOT cancelled. - */ - ASSERT(!(flags & XFS_BLF_CANCEL)); - return NULL; -} - -/* - * If the buffer is being cancelled then return 1 so that it will be cancelled, - * otherwise return 0. If the buffer is actually a buffer cancel item - * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the - * table and remove it from the table if this is the last reference. - * - * We remove the cancel record from the table when we encounter its last - * occurrence in the log so that if the same buffer is re-used again after its - * last cancellation we actually replay the changes made at that point. - */ -STATIC int -xlog_check_buffer_cancelled( +void +xlog_buf_readahead( struct xlog *log, xfs_daddr_t blkno, uint len, - unsigned short flags) -{ - struct xfs_buf_cancel *bcp; - - bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); - if (!bcp) - return 0; - - /* - * We've go a match, so return 1 so that the recovery of this buffer - * is cancelled. If this buffer is actually a buffer cancel log - * item, then decrement the refcount on the one in the table and - * remove it if this is the last reference. - */ - if (flags & XFS_BLF_CANCEL) { - if (--bcp->bc_refcount == 0) { - list_del(&bcp->bc_list); - kmem_free(bcp); - } - } - return 1; -} - -/* - * Perform recovery for a buffer full of inodes. In these buffers, the only - * data which should be recovered is that which corresponds to the - * di_next_unlinked pointers in the on disk inode structures. The rest of the - * data for the inodes is always logged through the inodes themselves rather - * than the inode buffer and is recovered in xlog_recover_inode_pass2(). - * - * The only time when buffers full of inodes are fully recovered is when the - * buffer is full of newly allocated inodes. In this case the buffer will - * not be marked as an inode buffer and so will be sent to - * xlog_recover_do_reg_buffer() below during recovery. - */ -STATIC int -xlog_recover_do_inode_buffer( - struct xfs_mount *mp, - xlog_recover_item_t *item, - struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f) -{ - int i; - int item_index = 0; - int bit = 0; - int nbits = 0; - int reg_buf_offset = 0; - int reg_buf_bytes = 0; - int next_unlinked_offset; - int inodes_per_buf; - xfs_agino_t *logged_nextp; - xfs_agino_t *buffer_nextp; - - trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - - /* - * Post recovery validation only works properly on CRC enabled - * filesystems. - */ - if (xfs_sb_version_hascrc(&mp->m_sb)) - bp->b_ops = &xfs_inode_buf_ops; - - inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; - for (i = 0; i < inodes_per_buf; i++) { - next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + - offsetof(xfs_dinode_t, di_next_unlinked); - - while (next_unlinked_offset >= - (reg_buf_offset + reg_buf_bytes)) { - /* - * The next di_next_unlinked field is beyond - * the current logged region. Find the next - * logged region that contains or is beyond - * the current di_next_unlinked field. - */ - bit += nbits; - bit = xfs_next_bit(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - - /* - * If there are no more logged regions in the - * buffer, then we're done. - */ - if (bit == -1) - return 0; - - nbits = xfs_contig_bits(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - ASSERT(nbits > 0); - reg_buf_offset = bit << XFS_BLF_SHIFT; - reg_buf_bytes = nbits << XFS_BLF_SHIFT; - item_index++; - } - - /* - * If the current logged region starts after the current - * di_next_unlinked field, then move on to the next - * di_next_unlinked field. - */ - if (next_unlinked_offset < reg_buf_offset) - continue; - - ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); - - /* - * The current logged region contains a copy of the - * current di_next_unlinked field. Extract its value - * and copy it to the buffer copy. - */ - logged_nextp = item->ri_buf[item_index].i_addr + - next_unlinked_offset - reg_buf_offset; - if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { - xfs_alert(mp, - "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). " - "Trying to replay bad (0) inode di_next_unlinked field.", - item, bp); - return -EFSCORRUPTED; - } - - buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); - *buffer_nextp = *logged_nextp; - - /* - * If necessary, recalculate the CRC in the on-disk inode. We - * have to leave the inode in a consistent state for whoever - * reads it next.... - */ - xfs_dinode_calc_crc(mp, - xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); - - } - - return 0; -} - -/* - * V5 filesystems know the age of the buffer on disk being recovered. We can - * have newer objects on disk than we are replaying, and so for these cases we - * don't want to replay the current change as that will make the buffer contents - * temporarily invalid on disk. - * - * The magic number might not match the buffer type we are going to recover - * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence - * extract the LSN of the existing object in the buffer based on it's current - * magic number. If we don't recognise the magic number in the buffer, then - * return a LSN of -1 so that the caller knows it was an unrecognised block and - * so can recover the buffer. - * - * Note: we cannot rely solely on magic number matches to determine that the - * buffer has a valid LSN - we also need to verify that it belongs to this - * filesystem, so we need to extract the object's LSN and compare it to that - * which we read from the superblock. If the UUIDs don't match, then we've got a - * stale metadata block from an old filesystem instance that we need to recover - * over the top of. - */ -static xfs_lsn_t -xlog_recover_get_buf_lsn( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - uint32_t magic32; - uint16_t magic16; - uint16_t magicda; - void *blk = bp->b_addr; - uuid_t *uuid; - xfs_lsn_t lsn = -1; - - /* v4 filesystems always recover immediately */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) - goto recover_immediately; - - magic32 = be32_to_cpu(*(__be32 *)blk); - switch (magic32) { - case XFS_ABTB_CRC_MAGIC: - case XFS_ABTC_CRC_MAGIC: - case XFS_ABTB_MAGIC: - case XFS_ABTC_MAGIC: - case XFS_RMAP_CRC_MAGIC: - case XFS_REFC_CRC_MAGIC: - case XFS_IBT_CRC_MAGIC: - case XFS_IBT_MAGIC: { - struct xfs_btree_block *btb = blk; - - lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); - uuid = &btb->bb_u.s.bb_uuid; - break; - } - case XFS_BMAP_CRC_MAGIC: - case XFS_BMAP_MAGIC: { - struct xfs_btree_block *btb = blk; - - lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); - uuid = &btb->bb_u.l.bb_uuid; - break; - } - case XFS_AGF_MAGIC: - lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); - uuid = &((struct xfs_agf *)blk)->agf_uuid; - break; - case XFS_AGFL_MAGIC: - lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); - uuid = &((struct xfs_agfl *)blk)->agfl_uuid; - break; - case XFS_AGI_MAGIC: - lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); - uuid = &((struct xfs_agi *)blk)->agi_uuid; - break; - case XFS_SYMLINK_MAGIC: - lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); - uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; - break; - case XFS_DIR3_BLOCK_MAGIC: - case XFS_DIR3_DATA_MAGIC: - case XFS_DIR3_FREE_MAGIC: - lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); - uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; - break; - case XFS_ATTR3_RMT_MAGIC: - /* - * Remote attr blocks are written synchronously, rather than - * being logged. That means they do not contain a valid LSN - * (i.e. transactionally ordered) in them, and hence any time we - * see a buffer to replay over the top of a remote attribute - * block we should simply do so. - */ - goto recover_immediately; - case XFS_SB_MAGIC: - /* - * superblock uuids are magic. We may or may not have a - * sb_meta_uuid on disk, but it will be set in the in-core - * superblock. We set the uuid pointer for verification - * according to the superblock feature mask to ensure we check - * the relevant UUID in the superblock. - */ - lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); - if (xfs_sb_version_hasmetauuid(&mp->m_sb)) - uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; - else - uuid = &((struct xfs_dsb *)blk)->sb_uuid; - break; - default: - break; - } - - if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) - goto recover_immediately; - return lsn; - } - - magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); - switch (magicda) { - case XFS_DIR3_LEAF1_MAGIC: - case XFS_DIR3_LEAFN_MAGIC: - case XFS_DA3_NODE_MAGIC: - lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); - uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; - break; - default: - break; - } - - if (lsn != (xfs_lsn_t)-1) { - if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) - goto recover_immediately; - return lsn; - } - - /* - * We do individual object checks on dquot and inode buffers as they - * have their own individual LSN records. Also, we could have a stale - * buffer here, so we have to at least recognise these buffer types. - * - * A notd complexity here is inode unlinked list processing - it logs - * the inode directly in the buffer, but we don't know which inodes have - * been modified, and there is no global buffer LSN. Hence we need to - * recover all inode buffer types immediately. This problem will be - * fixed by logical logging of the unlinked list modifications. - */ - magic16 = be16_to_cpu(*(__be16 *)blk); - switch (magic16) { - case XFS_DQUOT_MAGIC: - case XFS_DINODE_MAGIC: - goto recover_immediately; - default: - break; - } - - /* unknown buffer contents, recover immediately */ - -recover_immediately: - return (xfs_lsn_t)-1; - -} - -/* - * Validate the recovered buffer is of the correct type and attach the - * appropriate buffer operations to them for writeback. Magic numbers are in a - * few places: - * the first 16 bits of the buffer (inode buffer, dquot buffer), - * the first 32 bits of the buffer (most blocks), - * inside a struct xfs_da_blkinfo at the start of the buffer. - */ -static void -xlog_recover_validate_buf_type( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f, - xfs_lsn_t current_lsn) -{ - struct xfs_da_blkinfo *info = bp->b_addr; - uint32_t magic32; - uint16_t magic16; - uint16_t magicda; - char *warnmsg = NULL; - - /* - * We can only do post recovery validation on items on CRC enabled - * fielsystems as we need to know when the buffer was written to be able - * to determine if we should have replayed the item. If we replay old - * metadata over a newer buffer, then it will enter a temporarily - * inconsistent state resulting in verification failures. Hence for now - * just avoid the verification stage for non-crc filesystems - */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; - - magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); - magic16 = be16_to_cpu(*(__be16*)bp->b_addr); - magicda = be16_to_cpu(info->magic); - switch (xfs_blft_from_flags(buf_f)) { - case XFS_BLFT_BTREE_BUF: - switch (magic32) { - case XFS_ABTB_CRC_MAGIC: - case XFS_ABTB_MAGIC: - bp->b_ops = &xfs_bnobt_buf_ops; - break; - case XFS_ABTC_CRC_MAGIC: - case XFS_ABTC_MAGIC: - bp->b_ops = &xfs_cntbt_buf_ops; - break; - case XFS_IBT_CRC_MAGIC: - case XFS_IBT_MAGIC: - bp->b_ops = &xfs_inobt_buf_ops; - break; - case XFS_FIBT_CRC_MAGIC: - case XFS_FIBT_MAGIC: - bp->b_ops = &xfs_finobt_buf_ops; - break; - case XFS_BMAP_CRC_MAGIC: - case XFS_BMAP_MAGIC: - bp->b_ops = &xfs_bmbt_buf_ops; - break; - case XFS_RMAP_CRC_MAGIC: - bp->b_ops = &xfs_rmapbt_buf_ops; - break; - case XFS_REFC_CRC_MAGIC: - bp->b_ops = &xfs_refcountbt_buf_ops; - break; - default: - warnmsg = "Bad btree block magic!"; - break; - } - break; - case XFS_BLFT_AGF_BUF: - if (magic32 != XFS_AGF_MAGIC) { - warnmsg = "Bad AGF block magic!"; - break; - } - bp->b_ops = &xfs_agf_buf_ops; - break; - case XFS_BLFT_AGFL_BUF: - if (magic32 != XFS_AGFL_MAGIC) { - warnmsg = "Bad AGFL block magic!"; - break; - } - bp->b_ops = &xfs_agfl_buf_ops; - break; - case XFS_BLFT_AGI_BUF: - if (magic32 != XFS_AGI_MAGIC) { - warnmsg = "Bad AGI block magic!"; - break; - } - bp->b_ops = &xfs_agi_buf_ops; - break; - case XFS_BLFT_UDQUOT_BUF: - case XFS_BLFT_PDQUOT_BUF: - case XFS_BLFT_GDQUOT_BUF: -#ifdef CONFIG_XFS_QUOTA - if (magic16 != XFS_DQUOT_MAGIC) { - warnmsg = "Bad DQUOT block magic!"; - break; - } - bp->b_ops = &xfs_dquot_buf_ops; -#else - xfs_alert(mp, - "Trying to recover dquots without QUOTA support built in!"); - ASSERT(0); -#endif - break; - case XFS_BLFT_DINO_BUF: - if (magic16 != XFS_DINODE_MAGIC) { - warnmsg = "Bad INODE block magic!"; - break; - } - bp->b_ops = &xfs_inode_buf_ops; - break; - case XFS_BLFT_SYMLINK_BUF: - if (magic32 != XFS_SYMLINK_MAGIC) { - warnmsg = "Bad symlink block magic!"; - break; - } - bp->b_ops = &xfs_symlink_buf_ops; - break; - case XFS_BLFT_DIR_BLOCK_BUF: - if (magic32 != XFS_DIR2_BLOCK_MAGIC && - magic32 != XFS_DIR3_BLOCK_MAGIC) { - warnmsg = "Bad dir block magic!"; - break; - } - bp->b_ops = &xfs_dir3_block_buf_ops; - break; - case XFS_BLFT_DIR_DATA_BUF: - if (magic32 != XFS_DIR2_DATA_MAGIC && - magic32 != XFS_DIR3_DATA_MAGIC) { - warnmsg = "Bad dir data magic!"; - break; - } - bp->b_ops = &xfs_dir3_data_buf_ops; - break; - case XFS_BLFT_DIR_FREE_BUF: - if (magic32 != XFS_DIR2_FREE_MAGIC && - magic32 != XFS_DIR3_FREE_MAGIC) { - warnmsg = "Bad dir3 free magic!"; - break; - } - bp->b_ops = &xfs_dir3_free_buf_ops; - break; - case XFS_BLFT_DIR_LEAF1_BUF: - if (magicda != XFS_DIR2_LEAF1_MAGIC && - magicda != XFS_DIR3_LEAF1_MAGIC) { - warnmsg = "Bad dir leaf1 magic!"; - break; - } - bp->b_ops = &xfs_dir3_leaf1_buf_ops; - break; - case XFS_BLFT_DIR_LEAFN_BUF: - if (magicda != XFS_DIR2_LEAFN_MAGIC && - magicda != XFS_DIR3_LEAFN_MAGIC) { - warnmsg = "Bad dir leafn magic!"; - break; - } - bp->b_ops = &xfs_dir3_leafn_buf_ops; - break; - case XFS_BLFT_DA_NODE_BUF: - if (magicda != XFS_DA_NODE_MAGIC && - magicda != XFS_DA3_NODE_MAGIC) { - warnmsg = "Bad da node magic!"; - break; - } - bp->b_ops = &xfs_da3_node_buf_ops; - break; - case XFS_BLFT_ATTR_LEAF_BUF: - if (magicda != XFS_ATTR_LEAF_MAGIC && - magicda != XFS_ATTR3_LEAF_MAGIC) { - warnmsg = "Bad attr leaf magic!"; - break; - } - bp->b_ops = &xfs_attr3_leaf_buf_ops; - break; - case XFS_BLFT_ATTR_RMT_BUF: - if (magic32 != XFS_ATTR3_RMT_MAGIC) { - warnmsg = "Bad attr remote magic!"; - break; - } - bp->b_ops = &xfs_attr3_rmt_buf_ops; - break; - case XFS_BLFT_SB_BUF: - if (magic32 != XFS_SB_MAGIC) { - warnmsg = "Bad SB block magic!"; - break; - } - bp->b_ops = &xfs_sb_buf_ops; - break; -#ifdef CONFIG_XFS_RT - case XFS_BLFT_RTBITMAP_BUF: - case XFS_BLFT_RTSUMMARY_BUF: - /* no magic numbers for verification of RT buffers */ - bp->b_ops = &xfs_rtbuf_ops; - break; -#endif /* CONFIG_XFS_RT */ - default: - xfs_warn(mp, "Unknown buffer type %d!", - xfs_blft_from_flags(buf_f)); - break; - } - - /* - * Nothing else to do in the case of a NULL current LSN as this means - * the buffer is more recent than the change in the log and will be - * skipped. - */ - if (current_lsn == NULLCOMMITLSN) - return; - - if (warnmsg) { - xfs_warn(mp, warnmsg); - ASSERT(0); - } - - /* - * We must update the metadata LSN of the buffer as it is written out to - * ensure that older transactions never replay over this one and corrupt - * the buffer. This can occur if log recovery is interrupted at some - * point after the current transaction completes, at which point a - * subsequent mount starts recovery from the beginning. - * - * Write verifiers update the metadata LSN from log items attached to - * the buffer. Therefore, initialize a bli purely to carry the LSN to - * the verifier. We'll clean it up in our ->iodone() callback. - */ - if (bp->b_ops) { - struct xfs_buf_log_item *bip; - - ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_item_init(bp, mp); - bip = bp->b_log_item; - bip->bli_item.li_lsn = current_lsn; - } -} - -/* - * Perform a 'normal' buffer recovery. Each logged region of the - * buffer should be copied over the corresponding region in the - * given buffer. The bitmap in the buf log format structure indicates - * where to place the logged data. - */ -STATIC void -xlog_recover_do_reg_buffer( - struct xfs_mount *mp, - xlog_recover_item_t *item, - struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f, - xfs_lsn_t current_lsn) -{ - int i; - int bit; - int nbits; - xfs_failaddr_t fa; - const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); - - trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); - - bit = 0; - i = 1; /* 0 is the buf format structure */ - while (1) { - bit = xfs_next_bit(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - if (bit == -1) - break; - nbits = xfs_contig_bits(buf_f->blf_data_map, - buf_f->blf_map_size, bit); - ASSERT(nbits > 0); - ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); - ASSERT(BBTOB(bp->b_length) >= - ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); - - /* - * The dirty regions logged in the buffer, even though - * contiguous, may span multiple chunks. This is because the - * dirty region may span a physical page boundary in a buffer - * and hence be split into two separate vectors for writing into - * the log. Hence we need to trim nbits back to the length of - * the current region being copied out of the log. - */ - if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) - nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; - - /* - * Do a sanity check if this is a dquot buffer. Just checking - * the first dquot in the buffer should do. XXXThis is - * probably a good thing to do for other buf types also. - */ - fa = NULL; - if (buf_f->blf_flags & - (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { - if (item->ri_buf[i].i_addr == NULL) { - xfs_alert(mp, - "XFS: NULL dquot in %s.", __func__); - goto next; - } - if (item->ri_buf[i].i_len < size_disk_dquot) { - xfs_alert(mp, - "XFS: dquot too small (%d) in %s.", - item->ri_buf[i].i_len, __func__); - goto next; - } - fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, - -1, 0); - if (fa) { - xfs_alert(mp, - "dquot corrupt at %pS trying to replay into block 0x%llx", - fa, bp->b_bn); - goto next; - } - } - - memcpy(xfs_buf_offset(bp, - (uint)bit << XFS_BLF_SHIFT), /* dest */ - item->ri_buf[i].i_addr, /* source */ - nbits<<XFS_BLF_SHIFT); /* length */ - next: - i++; - bit += nbits; - } - - /* Shouldn't be any more regions */ - ASSERT(i == item->ri_total); - - xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); -} - -/* - * Perform a dquot buffer recovery. - * Simple algorithm: if we have found a QUOTAOFF log item of the same type - * (ie. USR or GRP), then just toss this buffer away; don't recover it. - * Else, treat it as a regular buffer and do recovery. - * - * Return false if the buffer was tossed and true if we recovered the buffer to - * indicate to the caller if the buffer needs writing. - */ -STATIC bool -xlog_recover_do_dquot_buffer( - struct xfs_mount *mp, - struct xlog *log, - struct xlog_recover_item *item, - struct xfs_buf *bp, - struct xfs_buf_log_format *buf_f) -{ - uint type; - - trace_xfs_log_recover_buf_dquot_buf(log, buf_f); - - /* - * Filesystems are required to send in quota flags at mount time. - */ - if (!mp->m_qflags) - return false; - - type = 0; - if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) - type |= XFS_DQ_USER; - if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) - type |= XFS_DQ_PROJ; - if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) - type |= XFS_DQ_GROUP; - /* - * This type of quotas was turned off, so ignore this buffer - */ - if (log->l_quotaoffs_flag & type) - return false; - - xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); - return true; -} - -/* - * This routine replays a modification made to a buffer at runtime. - * There are actually two types of buffer, regular and inode, which - * are handled differently. Inode buffers are handled differently - * in that we only recover a specific set of data from them, namely - * the inode di_next_unlinked fields. This is because all other inode - * data is actually logged via inode records and any data we replay - * here which overlaps that may be stale. - * - * When meta-data buffers are freed at run time we log a buffer item - * with the XFS_BLF_CANCEL bit set to indicate that previous copies - * of the buffer in the log should not be replayed at recovery time. - * This is so that if the blocks covered by the buffer are reused for - * file data before we crash we don't end up replaying old, freed - * meta-data into a user's file. - * - * To handle the cancellation of buffer log items, we make two passes - * over the log during recovery. During the first we build a table of - * those buffers which have been cancelled, and during the second we - * only replay those buffers which do not have corresponding cancel - * records in the table. See xlog_recover_buffer_pass[1,2] above - * for more details on the implementation of the table of cancel records. - */ -STATIC int -xlog_recover_buffer_pass2( - struct xlog *log, - struct list_head *buffer_list, - struct xlog_recover_item *item, - xfs_lsn_t current_lsn) -{ - xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - xfs_mount_t *mp = log->l_mp; - xfs_buf_t *bp; - int error; - uint buf_flags; - xfs_lsn_t lsn; - - /* - * In this pass we only want to recover all the buffers which have - * not been cancelled and are not cancellation buffers themselves. - */ - if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, - buf_f->blf_len, buf_f->blf_flags)) { - trace_xfs_log_recover_buf_cancel(log, buf_f); - return 0; - } - - trace_xfs_log_recover_buf_recover(log, buf_f); - - buf_flags = 0; - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) - buf_flags |= XBF_UNMAPPED; - - error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags, &bp, NULL); - if (error) - return error; - - /* - * Recover the buffer only if we get an LSN from it and it's less than - * the lsn of the transaction we are replaying. - * - * Note that we have to be extremely careful of readahead here. - * Readahead does not attach verfiers to the buffers so if we don't - * actually do any replay after readahead because of the LSN we found - * in the buffer if more recent than that current transaction then we - * need to attach the verifier directly. Failure to do so can lead to - * future recovery actions (e.g. EFI and unlinked list recovery) can - * operate on the buffers and they won't get the verifier attached. This - * can lead to blocks on disk having the correct content but a stale - * CRC. - * - * It is safe to assume these clean buffers are currently up to date. - * If the buffer is dirtied by a later transaction being replayed, then - * the verifier will be reset to match whatever recover turns that - * buffer into. - */ - lsn = xlog_recover_get_buf_lsn(mp, bp); - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - trace_xfs_log_recover_buf_skip(log, buf_f); - xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); - goto out_release; - } - - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { - error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); - if (error) - goto out_release; - } else if (buf_f->blf_flags & - (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { - bool dirty; - - dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); - if (!dirty) - goto out_release; - } else { - xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); - } - - /* - * Perform delayed write on the buffer. Asynchronous writes will be - * slower when taking into account all the buffers to be flushed. - * - * Also make sure that only inode buffers with good sizes stay in - * the buffer cache. The kernel moves inodes in buffers of 1 block - * or inode_cluster_size bytes, whichever is bigger. The inode - * buffers in the log can be a different size if the log was generated - * by an older kernel using unclustered inode buffers or a newer kernel - * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't max(blocksize, inode_cluster_size) - * for *our* value of inode_cluster_size, then we need to keep - * the buffer out of the buffer cache so that the buffer won't - * overlap with future reads of those inodes. - */ - if (XFS_DINODE_MAGIC == - be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { - xfs_buf_stale(bp); - error = xfs_bwrite(bp); - } else { - ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp, buffer_list); - } - -out_release: - xfs_buf_relse(bp); - return error; -} - -/* - * Inode fork owner changes - * - * If we have been told that we have to reparent the inode fork, it's because an - * extent swap operation on a CRC enabled filesystem has been done and we are - * replaying it. We need to walk the BMBT of the appropriate fork and change the - * owners of it. - * - * The complexity here is that we don't have an inode context to work with, so - * after we've replayed the inode we need to instantiate one. This is where the - * fun begins. - * - * We are in the middle of log recovery, so we can't run transactions. That - * means we cannot use cache coherent inode instantiation via xfs_iget(), as - * that will result in the corresponding iput() running the inode through - * xfs_inactive(). If we've just replayed an inode core that changes the link - * count to zero (i.e. it's been unlinked), then xfs_inactive() will run - * transactions (bad!). - * - * So, to avoid this, we instantiate an inode directly from the inode core we've - * just recovered. We have the buffer still locked, and all we really need to - * instantiate is the inode core and the forks being modified. We can do this - * manually, then run the inode btree owner change, and then tear down the - * xfs_inode without having to run any transactions at all. - * - * Also, because we don't have a transaction context available here but need to - * gather all the buffers we modify for writeback so we pass the buffer_list - * instead for the operation to use. - */ - -STATIC int -xfs_recover_inode_owner_change( - struct xfs_mount *mp, - struct xfs_dinode *dip, - struct xfs_inode_log_format *in_f, - struct list_head *buffer_list) + const struct xfs_buf_ops *ops) { - struct xfs_inode *ip; - int error; - - ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); - - ip = xfs_inode_alloc(mp, in_f->ilf_ino); - if (!ip) - return -ENOMEM; - - /* instantiate the inode */ - ASSERT(dip->di_version >= 3); - xfs_inode_from_disk(ip, dip); - - error = xfs_iformat_fork(ip, dip); - if (error) - goto out_free_ip; - - if (!xfs_inode_verify_forks(ip)) { - error = -EFSCORRUPTED; - goto out_free_ip; - } - - if (in_f->ilf_fields & XFS_ILOG_DOWNER) { - ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); - error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, - ip->i_ino, buffer_list); - if (error) - goto out_free_ip; - } - - if (in_f->ilf_fields & XFS_ILOG_AOWNER) { - ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); - error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, - ip->i_ino, buffer_list); - if (error) - goto out_free_ip; - } - -out_free_ip: - xfs_inode_free(ip); - return error; -} - -STATIC int -xlog_recover_inode_pass2( - struct xlog *log, - struct list_head *buffer_list, - struct xlog_recover_item *item, - xfs_lsn_t current_lsn) -{ - struct xfs_inode_log_format *in_f; - xfs_mount_t *mp = log->l_mp; - xfs_buf_t *bp; - xfs_dinode_t *dip; - int len; - char *src; - char *dest; - int error; - int attr_index; - uint fields; - struct xfs_log_dinode *ldip; - uint isize; - int need_free = 0; - - if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { - in_f = item->ri_buf[0].i_addr; - } else { - in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); - need_free = 1; - error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); - if (error) - goto error; - } - - /* - * Inode buffers can be freed, look out for it, - * and do not replay the inode. - */ - if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, - in_f->ilf_len, 0)) { - error = 0; - trace_xfs_log_recover_inode_cancel(log, in_f); - goto error; - } - trace_xfs_log_recover_inode_recover(log, in_f); - - error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - 0, &bp, &xfs_inode_buf_ops); - if (error) - goto error; - ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); - dip = xfs_buf_offset(bp, in_f->ilf_boffset); - - /* - * Make sure the place we're flushing out to really looks - * like an inode! - */ - if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { - xfs_alert(mp, - "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", - __func__, dip, bp, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - ldip = item->ri_buf[1].i_addr; - if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld", - __func__, item, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - - /* - * If the inode has an LSN in it, recover the inode only if it's less - * than the lsn of the transaction we are replaying. Note: we still - * need to replay an owner change even though the inode is more recent - * than the transaction as there is no guarantee that all the btree - * blocks are more recent than this transaction, too. - */ - if (dip->di_version >= 3) { - xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); - - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - trace_xfs_log_recover_inode_skip(log, in_f); - error = 0; - goto out_owner_change; - } - } - - /* - * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes - * are transactional and if ordering is necessary we can determine that - * more accurately by the LSN field in the V3 inode core. Don't trust - * the inode versions we might be changing them here - use the - * superblock flag to determine whether we need to look at di_flushiter - * to skip replay when the on disk inode is newer than the log one - */ - if (!xfs_sb_version_has_v3inode(&mp->m_sb) && - ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { - /* - * Deal with the wrap case, DI_MAX_FLUSH is less - * than smaller numbers - */ - if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && - ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { - /* do nothing */ - } else { - trace_xfs_log_recover_inode_skip(log, in_f); - error = 0; - goto out_release; - } - } - - /* Take the opportunity to reset the flush iteration count */ - ldip->di_flushiter = 0; - - if (unlikely(S_ISREG(ldip->di_mode))) { - if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && - (ldip->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad regular inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - } else if (unlikely(S_ISDIR(ldip->di_mode))) { - if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && - (ldip->di_format != XFS_DINODE_FMT_BTREE) && - (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad dir inode log record, rec ptr "PTR_FMT", " - "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld", - __func__, item, dip, bp, in_f->ilf_ino); - error = -EFSCORRUPTED; - goto out_release; - } - } - if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld", - __func__, item, dip, bp, in_f->ilf_ino, - ldip->di_nextents + ldip->di_anextents, - ldip->di_nblocks); - error = -EFSCORRUPTED; - goto out_release; - } - if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", " - "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__, - item, dip, bp, in_f->ilf_ino, ldip->di_forkoff); - error = -EFSCORRUPTED; - goto out_release; - } - isize = xfs_log_dinode_size(mp); - if (unlikely(item->ri_buf[1].i_len > isize)) { - XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", - XFS_ERRLEVEL_LOW, mp, ldip, - sizeof(*ldip)); - xfs_alert(mp, - "%s: Bad inode log record length %d, rec ptr "PTR_FMT, - __func__, item->ri_buf[1].i_len, item); - error = -EFSCORRUPTED; - goto out_release; - } - - /* recover the log dinode inode into the on disk inode */ - xfs_log_dinode_to_disk(ldip, dip); - - fields = in_f->ilf_fields; - if (fields & XFS_ILOG_DEV) - xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); - - if (in_f->ilf_size == 2) - goto out_owner_change; - len = item->ri_buf[2].i_len; - src = item->ri_buf[2].i_addr; - ASSERT(in_f->ilf_size <= 4); - ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); - ASSERT(!(fields & XFS_ILOG_DFORK) || - (len == in_f->ilf_dsize)); - - switch (fields & XFS_ILOG_DFORK) { - case XFS_ILOG_DDATA: - case XFS_ILOG_DEXT: - memcpy(XFS_DFORK_DPTR(dip), src, len); - break; - - case XFS_ILOG_DBROOT: - xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, - (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), - XFS_DFORK_DSIZE(dip, mp)); - break; - - default: - /* - * There are no data fork flags set. - */ - ASSERT((fields & XFS_ILOG_DFORK) == 0); - break; - } - - /* - * If we logged any attribute data, recover it. There may or - * may not have been any other non-core data logged in this - * transaction. - */ - if (in_f->ilf_fields & XFS_ILOG_AFORK) { - if (in_f->ilf_fields & XFS_ILOG_DFORK) { - attr_index = 3; - } else { - attr_index = 2; - } - len = item->ri_buf[attr_index].i_len; - src = item->ri_buf[attr_index].i_addr; - ASSERT(len == in_f->ilf_asize); - - switch (in_f->ilf_fields & XFS_ILOG_AFORK) { - case XFS_ILOG_ADATA: - case XFS_ILOG_AEXT: - dest = XFS_DFORK_APTR(dip); - ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); - memcpy(dest, src, len); - break; - - case XFS_ILOG_ABROOT: - dest = XFS_DFORK_APTR(dip); - xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, - len, (xfs_bmdr_block_t*)dest, - XFS_DFORK_ASIZE(dip, mp)); - break; - - default: - xfs_warn(log->l_mp, "%s: Invalid flag", __func__); - ASSERT(0); - error = -EFSCORRUPTED; - goto out_release; - } - } - -out_owner_change: - /* Recover the swapext owner change unless inode has been deleted */ - if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && - (dip->di_mode != 0)) - error = xfs_recover_inode_owner_change(mp, dip, in_f, - buffer_list); - /* re-generate the checksum. */ - xfs_dinode_calc_crc(log->l_mp, dip); - - ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp, buffer_list); - -out_release: - xfs_buf_relse(bp); -error: - if (need_free) - kmem_free(in_f); - return error; -} - -/* - * Recover QUOTAOFF records. We simply make a note of it in the xlog - * structure, so that we know not to do any dquot item or dquot buffer recovery, - * of that type. - */ -STATIC int -xlog_recover_quotaoff_pass1( - struct xlog *log, - struct xlog_recover_item *item) -{ - xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; - ASSERT(qoff_f); - - /* - * The logitem format's flag tells us if this was user quotaoff, - * group/project quotaoff or both. - */ - if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_USER; - if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_PROJ; - if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) - log->l_quotaoffs_flag |= XFS_DQ_GROUP; - - return 0; -} - -/* - * Recover a dquot record - */ -STATIC int -xlog_recover_dquot_pass2( - struct xlog *log, - struct list_head *buffer_list, - struct xlog_recover_item *item, - xfs_lsn_t current_lsn) -{ - xfs_mount_t *mp = log->l_mp; - xfs_buf_t *bp; - struct xfs_disk_dquot *ddq, *recddq; - xfs_failaddr_t fa; - int error; - xfs_dq_logformat_t *dq_f; - uint type; - - - /* - * Filesystems are required to send in quota flags at mount time. - */ - if (mp->m_qflags == 0) - return 0; - - recddq = item->ri_buf[1].i_addr; - if (recddq == NULL) { - xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); - return -EFSCORRUPTED; - } - if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) { - xfs_alert(log->l_mp, "dquot too small (%d) in %s.", - item->ri_buf[1].i_len, __func__); - return -EFSCORRUPTED; - } - - /* - * This type of quotas was turned off, so ignore this record. - */ - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); - ASSERT(type); - if (log->l_quotaoffs_flag & type) - return 0; - - /* - * At this point we know that quota was _not_ turned off. - * Since the mount flags are not indicating to us otherwise, this - * must mean that quota is on, and the dquot needs to be replayed. - * Remember that we may not have fully recovered the superblock yet, - * so we can't do the usual trick of looking at the SB quota bits. - * - * The other possibility, of course, is that the quota subsystem was - * removed since the last mount - ENOSYS. - */ - dq_f = item->ri_buf[0].i_addr; - ASSERT(dq_f); - fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0); - if (fa) { - xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", - dq_f->qlf_id, fa); - return -EFSCORRUPTED; - } - ASSERT(dq_f->qlf_len == 1); - - /* - * At this point we are assuming that the dquots have been allocated - * and hence the buffer has valid dquots stamped in it. It should, - * therefore, pass verifier validation. If the dquot is bad, then the - * we'll return an error here, so we don't need to specifically check - * the dquot in the buffer after the verifier has run. - */ - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, - &xfs_dquot_buf_ops); - if (error) - return error; - - ASSERT(bp); - ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); - - /* - * If the dquot has an LSN in it, recover the dquot only if it's less - * than the lsn of the transaction we are replaying. - */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; - xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); - - if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - goto out_release; - } - } - - memcpy(ddq, recddq, item->ri_buf[1].i_len); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), - XFS_DQUOT_CRC_OFF); - } - - ASSERT(dq_f->qlf_size == 2); - ASSERT(bp->b_mount == mp); - bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp, buffer_list); - -out_release: - xfs_buf_relse(bp); - return 0; -} - -/* - * This routine is called to create an in-core extent free intent - * item from the efi format structure which was logged on disk. - * It allocates an in-core efi, copies the extents from the format - * structure into it, and adds the efi to the AIL with the given - * LSN. - */ -STATIC int -xlog_recover_efi_pass2( - struct xlog *log, - struct xlog_recover_item *item, - xfs_lsn_t lsn) -{ - int error; - struct xfs_mount *mp = log->l_mp; - struct xfs_efi_log_item *efip; - struct xfs_efi_log_format *efi_formatp; - - efi_formatp = item->ri_buf[0].i_addr; - - efip = xfs_efi_init(mp, efi_formatp->efi_nextents); - error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format); - if (error) { - xfs_efi_item_free(efip); - return error; - } - atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - - spin_lock(&log->l_ailp->ail_lock); - /* - * The EFI has two references. One for the EFD and one for EFI to ensure - * it makes it into the AIL. Insert the EFI into the AIL directly and - * drop the EFI reference. Note that xfs_trans_ail_update() drops the - * AIL lock. - */ - xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); - xfs_efi_release(efip); - return 0; -} - - -/* - * This routine is called when an EFD format structure is found in a committed - * transaction in the log. Its purpose is to cancel the corresponding EFI if it - * was still in the log. To do this it searches the AIL for the EFI with an id - * equal to that in the EFD format structure. If we find it we drop the EFD - * reference, which removes the EFI from the AIL and frees it. - */ -STATIC int -xlog_recover_efd_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - xfs_efd_log_format_t *efd_formatp; - xfs_efi_log_item_t *efip = NULL; - struct xfs_log_item *lip; - uint64_t efi_id; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp = log->l_ailp; - - efd_formatp = item->ri_buf[0].i_addr; - ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || - (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + - ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); - efi_id = efd_formatp->efd_efi_id; - - /* - * Search for the EFI with the id in the EFD format structure in the - * AIL. - */ - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (lip->li_type == XFS_LI_EFI) { - efip = (xfs_efi_log_item_t *)lip; - if (efip->efi_format.efi_id == efi_id) { - /* - * Drop the EFD reference to the EFI. This - * removes the EFI from the AIL and frees it. - */ - spin_unlock(&ailp->ail_lock); - xfs_efi_release(efip); - spin_lock(&ailp->ail_lock); - break; - } - } - lip = xfs_trans_ail_cursor_next(ailp, &cur); - } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); - - return 0; -} - -/* - * This routine is called to create an in-core extent rmap update - * item from the rui format structure which was logged on disk. - * It allocates an in-core rui, copies the extents from the format - * structure into it, and adds the rui to the AIL with the given - * LSN. - */ -STATIC int -xlog_recover_rui_pass2( - struct xlog *log, - struct xlog_recover_item *item, - xfs_lsn_t lsn) -{ - int error; - struct xfs_mount *mp = log->l_mp; - struct xfs_rui_log_item *ruip; - struct xfs_rui_log_format *rui_formatp; - - rui_formatp = item->ri_buf[0].i_addr; - - ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); - error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); - if (error) { - xfs_rui_item_free(ruip); - return error; - } - atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); - - spin_lock(&log->l_ailp->ail_lock); - /* - * The RUI has two references. One for the RUD and one for RUI to ensure - * it makes it into the AIL. Insert the RUI into the AIL directly and - * drop the RUI reference. Note that xfs_trans_ail_update() drops the - * AIL lock. - */ - xfs_trans_ail_update(log->l_ailp, &ruip->rui_item, lsn); - xfs_rui_release(ruip); - return 0; -} - - -/* - * This routine is called when an RUD format structure is found in a committed - * transaction in the log. Its purpose is to cancel the corresponding RUI if it - * was still in the log. To do this it searches the AIL for the RUI with an id - * equal to that in the RUD format structure. If we find it we drop the RUD - * reference, which removes the RUI from the AIL and frees it. - */ -STATIC int -xlog_recover_rud_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_rud_log_format *rud_formatp; - struct xfs_rui_log_item *ruip = NULL; - struct xfs_log_item *lip; - uint64_t rui_id; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp = log->l_ailp; - - rud_formatp = item->ri_buf[0].i_addr; - ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); - rui_id = rud_formatp->rud_rui_id; - - /* - * Search for the RUI with the id in the RUD format structure in the - * AIL. - */ - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (lip->li_type == XFS_LI_RUI) { - ruip = (struct xfs_rui_log_item *)lip; - if (ruip->rui_format.rui_id == rui_id) { - /* - * Drop the RUD reference to the RUI. This - * removes the RUI from the AIL and frees it. - */ - spin_unlock(&ailp->ail_lock); - xfs_rui_release(ruip); - spin_lock(&ailp->ail_lock); - break; - } - } - lip = xfs_trans_ail_cursor_next(ailp, &cur); - } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); - - return 0; -} - -/* - * Copy an CUI format buffer from the given buf, and into the destination - * CUI format structure. The CUI/CUD items were designed not to need any - * special alignment handling. - */ -static int -xfs_cui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_cui_log_format *dst_cui_fmt) -{ - struct xfs_cui_log_format *src_cui_fmt; - uint len; - - src_cui_fmt = buf->i_addr; - len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); - - if (buf->i_len == len) { - memcpy(dst_cui_fmt, src_cui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; -} - -/* - * This routine is called to create an in-core extent refcount update - * item from the cui format structure which was logged on disk. - * It allocates an in-core cui, copies the extents from the format - * structure into it, and adds the cui to the AIL with the given - * LSN. - */ -STATIC int -xlog_recover_cui_pass2( - struct xlog *log, - struct xlog_recover_item *item, - xfs_lsn_t lsn) -{ - int error; - struct xfs_mount *mp = log->l_mp; - struct xfs_cui_log_item *cuip; - struct xfs_cui_log_format *cui_formatp; - - cui_formatp = item->ri_buf[0].i_addr; - - cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); - error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); - if (error) { - xfs_cui_item_free(cuip); - return error; - } - atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); - - spin_lock(&log->l_ailp->ail_lock); - /* - * The CUI has two references. One for the CUD and one for CUI to ensure - * it makes it into the AIL. Insert the CUI into the AIL directly and - * drop the CUI reference. Note that xfs_trans_ail_update() drops the - * AIL lock. - */ - xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn); - xfs_cui_release(cuip); - return 0; -} - - -/* - * This routine is called when an CUD format structure is found in a committed - * transaction in the log. Its purpose is to cancel the corresponding CUI if it - * was still in the log. To do this it searches the AIL for the CUI with an id - * equal to that in the CUD format structure. If we find it we drop the CUD - * reference, which removes the CUI from the AIL and frees it. - */ -STATIC int -xlog_recover_cud_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_cud_log_format *cud_formatp; - struct xfs_cui_log_item *cuip = NULL; - struct xfs_log_item *lip; - uint64_t cui_id; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp = log->l_ailp; - - cud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); - return -EFSCORRUPTED; - } - cui_id = cud_formatp->cud_cui_id; - - /* - * Search for the CUI with the id in the CUD format structure in the - * AIL. - */ - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (lip->li_type == XFS_LI_CUI) { - cuip = (struct xfs_cui_log_item *)lip; - if (cuip->cui_format.cui_id == cui_id) { - /* - * Drop the CUD reference to the CUI. This - * removes the CUI from the AIL and frees it. - */ - spin_unlock(&ailp->ail_lock); - xfs_cui_release(cuip); - spin_lock(&ailp->ail_lock); - break; - } - } - lip = xfs_trans_ail_cursor_next(ailp, &cur); - } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); - - return 0; -} - -/* - * Copy an BUI format buffer from the given buf, and into the destination - * BUI format structure. The BUI/BUD items were designed not to need any - * special alignment handling. - */ -static int -xfs_bui_copy_format( - struct xfs_log_iovec *buf, - struct xfs_bui_log_format *dst_bui_fmt) -{ - struct xfs_bui_log_format *src_bui_fmt; - uint len; - - src_bui_fmt = buf->i_addr; - len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); - - if (buf->i_len == len) { - memcpy(dst_bui_fmt, src_bui_fmt, len); - return 0; - } - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); - return -EFSCORRUPTED; -} - -/* - * This routine is called to create an in-core extent bmap update - * item from the bui format structure which was logged on disk. - * It allocates an in-core bui, copies the extents from the format - * structure into it, and adds the bui to the AIL with the given - * LSN. - */ -STATIC int -xlog_recover_bui_pass2( - struct xlog *log, - struct xlog_recover_item *item, - xfs_lsn_t lsn) -{ - int error; - struct xfs_mount *mp = log->l_mp; - struct xfs_bui_log_item *buip; - struct xfs_bui_log_format *bui_formatp; - - bui_formatp = item->ri_buf[0].i_addr; - - if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); - return -EFSCORRUPTED; - } - buip = xfs_bui_init(mp); - error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); - if (error) { - xfs_bui_item_free(buip); - return error; - } - atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); - - spin_lock(&log->l_ailp->ail_lock); - /* - * The RUI has two references. One for the RUD and one for RUI to ensure - * it makes it into the AIL. Insert the RUI into the AIL directly and - * drop the RUI reference. Note that xfs_trans_ail_update() drops the - * AIL lock. - */ - xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn); - xfs_bui_release(buip); - return 0; -} - - -/* - * This routine is called when an BUD format structure is found in a committed - * transaction in the log. Its purpose is to cancel the corresponding BUI if it - * was still in the log. To do this it searches the AIL for the BUI with an id - * equal to that in the BUD format structure. If we find it we drop the BUD - * reference, which removes the BUI from the AIL and frees it. - */ -STATIC int -xlog_recover_bud_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_bud_log_format *bud_formatp; - struct xfs_bui_log_item *buip = NULL; - struct xfs_log_item *lip; - uint64_t bui_id; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp = log->l_ailp; - - bud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); - return -EFSCORRUPTED; - } - bui_id = bud_formatp->bud_bui_id; - - /* - * Search for the BUI with the id in the BUD format structure in the - * AIL. - */ - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (lip->li_type == XFS_LI_BUI) { - buip = (struct xfs_bui_log_item *)lip; - if (buip->bui_format.bui_id == bui_id) { - /* - * Drop the BUD reference to the BUI. This - * removes the BUI from the AIL and frees it. - */ - spin_unlock(&ailp->ail_lock); - xfs_bui_release(buip); - spin_lock(&ailp->ail_lock); - break; - } - } - lip = xfs_trans_ail_cursor_next(ailp, &cur); - } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); - - return 0; -} - -/* - * This routine is called when an inode create format structure is found in a - * committed transaction in the log. It's purpose is to initialise the inodes - * being allocated on disk. This requires us to get inode cluster buffers that - * match the range to be initialised, stamped with inode templates and written - * by delayed write so that subsequent modifications will hit the cached buffer - * and only need writing out at the end of recovery. - */ -STATIC int -xlog_recover_do_icreate_pass2( - struct xlog *log, - struct list_head *buffer_list, - xlog_recover_item_t *item) -{ - struct xfs_mount *mp = log->l_mp; - struct xfs_icreate_log *icl; - struct xfs_ino_geometry *igeo = M_IGEO(mp); - xfs_agnumber_t agno; - xfs_agblock_t agbno; - unsigned int count; - unsigned int isize; - xfs_agblock_t length; - int bb_per_cluster; - int cancel_count; - int nbufs; - int i; - - icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; - if (icl->icl_type != XFS_LI_ICREATE) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); - return -EINVAL; - } - - if (icl->icl_size != 1) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); - return -EINVAL; - } - - agno = be32_to_cpu(icl->icl_ag); - if (agno >= mp->m_sb.sb_agcount) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); - return -EINVAL; - } - agbno = be32_to_cpu(icl->icl_agbno); - if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); - return -EINVAL; - } - isize = be32_to_cpu(icl->icl_isize); - if (isize != mp->m_sb.sb_inodesize) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); - return -EINVAL; - } - count = be32_to_cpu(icl->icl_count); - if (!count) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); - return -EINVAL; - } - length = be32_to_cpu(icl->icl_length); - if (!length || length >= mp->m_sb.sb_agblocks) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); - return -EINVAL; - } - - /* - * The inode chunk is either full or sparse and we only support - * m_ino_geo.ialloc_min_blks sized sparse allocations at this time. - */ - if (length != igeo->ialloc_blks && - length != igeo->ialloc_min_blks) { - xfs_warn(log->l_mp, - "%s: unsupported chunk length", __FUNCTION__); - return -EINVAL; - } - - /* verify inode count is consistent with extent length */ - if ((count >> mp->m_sb.sb_inopblog) != length) { - xfs_warn(log->l_mp, - "%s: inconsistent inode count and chunk length", - __FUNCTION__); - return -EINVAL; - } - - /* - * The icreate transaction can cover multiple cluster buffers and these - * buffers could have been freed and reused. Check the individual - * buffers for cancellation so we don't overwrite anything written after - * a cancellation. - */ - bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); - nbufs = length / igeo->blocks_per_cluster; - for (i = 0, cancel_count = 0; i < nbufs; i++) { - xfs_daddr_t daddr; - - daddr = XFS_AGB_TO_DADDR(mp, agno, - agbno + i * igeo->blocks_per_cluster); - if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0)) - cancel_count++; - } - - /* - * We currently only use icreate for a single allocation at a time. This - * means we should expect either all or none of the buffers to be - * cancelled. Be conservative and skip replay if at least one buffer is - * cancelled, but warn the user that something is awry if the buffers - * are not consistent. - * - * XXX: This must be refined to only skip cancelled clusters once we use - * icreate for multiple chunk allocations. - */ - ASSERT(!cancel_count || cancel_count == nbufs); - if (cancel_count) { - if (cancel_count != nbufs) - xfs_warn(mp, - "WARNING: partial inode chunk cancellation, skipped icreate."); - trace_xfs_log_recover_icreate_cancel(log, icl); - return 0; - } - - trace_xfs_log_recover_icreate_recover(log, icl); - return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, - length, be32_to_cpu(icl->icl_gen)); -} - -STATIC void -xlog_recover_buffer_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; - struct xfs_mount *mp = log->l_mp; - - if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, - buf_f->blf_len, buf_f->blf_flags)) { - return; - } - - xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, - buf_f->blf_len, NULL); -} - -STATIC void -xlog_recover_inode_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_inode_log_format ilf_buf; - struct xfs_inode_log_format *ilfp; - struct xfs_mount *mp = log->l_mp; - int error; - - if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { - ilfp = item->ri_buf[0].i_addr; - } else { - ilfp = &ilf_buf; - memset(ilfp, 0, sizeof(*ilfp)); - error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); - if (error) - return; - } - - if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) - return; - - xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, - ilfp->ilf_len, &xfs_inode_buf_ra_ops); -} - -STATIC void -xlog_recover_dquot_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - struct xfs_mount *mp = log->l_mp; - struct xfs_disk_dquot *recddq; - struct xfs_dq_logformat *dq_f; - uint type; - int len; - - - if (mp->m_qflags == 0) - return; - - recddq = item->ri_buf[1].i_addr; - if (recddq == NULL) - return; - if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) - return; - - type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); - ASSERT(type); - if (log->l_quotaoffs_flag & type) - return; - - dq_f = item->ri_buf[0].i_addr; - ASSERT(dq_f); - ASSERT(dq_f->qlf_len == 1); - - len = XFS_FSB_TO_BB(mp, dq_f->qlf_len); - if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0)) - return; - - xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len, - &xfs_dquot_buf_ra_ops); -} - -STATIC void -xlog_recover_ra_pass2( - struct xlog *log, - struct xlog_recover_item *item) -{ - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - xlog_recover_buffer_ra_pass2(log, item); - break; - case XFS_LI_INODE: - xlog_recover_inode_ra_pass2(log, item); - break; - case XFS_LI_DQUOT: - xlog_recover_dquot_ra_pass2(log, item); - break; - case XFS_LI_EFI: - case XFS_LI_EFD: - case XFS_LI_QUOTAOFF: - case XFS_LI_RUI: - case XFS_LI_RUD: - case XFS_LI_CUI: - case XFS_LI_CUD: - case XFS_LI_BUI: - case XFS_LI_BUD: - default: - break; - } -} - -STATIC int -xlog_recover_commit_pass1( - struct xlog *log, - struct xlog_recover *trans, - struct xlog_recover_item *item) -{ - trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); - - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - return xlog_recover_buffer_pass1(log, item); - case XFS_LI_QUOTAOFF: - return xlog_recover_quotaoff_pass1(log, item); - case XFS_LI_INODE: - case XFS_LI_EFI: - case XFS_LI_EFD: - case XFS_LI_DQUOT: - case XFS_LI_ICREATE: - case XFS_LI_RUI: - case XFS_LI_RUD: - case XFS_LI_CUI: - case XFS_LI_CUD: - case XFS_LI_BUI: - case XFS_LI_BUD: - /* nothing to do in pass 1 */ - return 0; - default: - xfs_warn(log->l_mp, "%s: invalid item type (%d)", - __func__, ITEM_TYPE(item)); - ASSERT(0); - return -EFSCORRUPTED; - } -} - -STATIC int -xlog_recover_commit_pass2( - struct xlog *log, - struct xlog_recover *trans, - struct list_head *buffer_list, - struct xlog_recover_item *item) -{ - trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); - - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - return xlog_recover_buffer_pass2(log, buffer_list, item, - trans->r_lsn); - case XFS_LI_INODE: - return xlog_recover_inode_pass2(log, buffer_list, item, - trans->r_lsn); - case XFS_LI_EFI: - return xlog_recover_efi_pass2(log, item, trans->r_lsn); - case XFS_LI_EFD: - return xlog_recover_efd_pass2(log, item); - case XFS_LI_RUI: - return xlog_recover_rui_pass2(log, item, trans->r_lsn); - case XFS_LI_RUD: - return xlog_recover_rud_pass2(log, item); - case XFS_LI_CUI: - return xlog_recover_cui_pass2(log, item, trans->r_lsn); - case XFS_LI_CUD: - return xlog_recover_cud_pass2(log, item); - case XFS_LI_BUI: - return xlog_recover_bui_pass2(log, item, trans->r_lsn); - case XFS_LI_BUD: - return xlog_recover_bud_pass2(log, item); - case XFS_LI_DQUOT: - return xlog_recover_dquot_pass2(log, buffer_list, item, - trans->r_lsn); - case XFS_LI_ICREATE: - return xlog_recover_do_icreate_pass2(log, buffer_list, item); - case XFS_LI_QUOTAOFF: - /* nothing to do in pass2 */ - return 0; - default: - xfs_warn(log->l_mp, "%s: invalid item type (%d)", - __func__, ITEM_TYPE(item)); - ASSERT(0); - return -EFSCORRUPTED; - } + if (!xlog_is_buffer_cancelled(log, blkno, len)) + xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops); } STATIC int @@ -4072,8 +1967,12 @@ xlog_recover_items_pass2( int error = 0; list_for_each_entry(item, item_list, ri_list) { - error = xlog_recover_commit_pass2(log, trans, - buffer_list, item); + trace_xfs_log_recover_item_recover(log, trans, item, + XLOG_RECOVER_PASS2); + + if (item->ri_ops->commit_pass2) + error = item->ri_ops->commit_pass2(log, buffer_list, + item, trans->r_lsn); if (error) return error; } @@ -4110,12 +2009,16 @@ xlog_recover_commit_trans( return error; list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { + trace_xfs_log_recover_item_recover(log, trans, item, pass); + switch (pass) { case XLOG_RECOVER_PASS1: - error = xlog_recover_commit_pass1(log, trans, item); + if (item->ri_ops->commit_pass1) + error = item->ri_ops->commit_pass1(log, item); break; case XLOG_RECOVER_PASS2: - xlog_recover_ra_pass2(log, item); + if (item->ri_ops->ra_pass2) + item->ri_ops->ra_pass2(log, item); list_move_tail(&item->ri_list, &ra_list); items_queued++; if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { @@ -4152,9 +2055,9 @@ STATIC void xlog_recover_add_item( struct list_head *head) { - xlog_recover_item_t *item; + struct xlog_recover_item *item; - item = kmem_zalloc(sizeof(xlog_recover_item_t), 0); + item = kmem_zalloc(sizeof(struct xlog_recover_item), 0); INIT_LIST_HEAD(&item->ri_list); list_add_tail(&item->ri_list, head); } @@ -4166,7 +2069,7 @@ xlog_recover_add_to_cont_trans( char *dp, int len) { - xlog_recover_item_t *item; + struct xlog_recover_item *item; char *ptr, *old_ptr; int old_len; @@ -4189,7 +2092,8 @@ xlog_recover_add_to_cont_trans( } /* take the tail entry */ - item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, + ri_list); old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; @@ -4223,7 +2127,7 @@ xlog_recover_add_to_trans( int len) { struct xfs_inode_log_format *in_f; /* any will do */ - xlog_recover_item_t *item; + struct xlog_recover_item *item; char *ptr; if (!len) @@ -4259,13 +2163,14 @@ xlog_recover_add_to_trans( in_f = (struct xfs_inode_log_format *)ptr; /* take the tail entry */ - item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, + ri_list); if (item->ri_total != 0 && item->ri_total == item->ri_cnt) { /* tail item is in use, get a new one */ xlog_recover_add_item(&trans->r_itemq); item = list_entry(trans->r_itemq.prev, - xlog_recover_item_t, ri_list); + struct xlog_recover_item, ri_list); } if (item->ri_total == 0) { /* first region to be added */ @@ -4311,7 +2216,7 @@ STATIC void xlog_recover_free_trans( struct xlog_recover *trans) { - xlog_recover_item_t *item, *n; + struct xlog_recover_item *item, *n; int i; hlist_del_init(&trans->r_list); @@ -4563,180 +2468,6 @@ xlog_recover_process_data( return 0; } -/* Recover the EFI if necessary. */ -STATIC int -xlog_recover_process_efi( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_efi_log_item *efip; - int error; - - /* - * Skip EFIs that we've already processed. - */ - efip = container_of(lip, struct xfs_efi_log_item, efi_item); - if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_efi_recover(mp, efip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the EFI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_efi( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_efi_log_item *efip; - - efip = container_of(lip, struct xfs_efi_log_item, efi_item); - - spin_unlock(&ailp->ail_lock); - xfs_efi_release(efip); - spin_lock(&ailp->ail_lock); -} - -/* Recover the RUI if necessary. */ -STATIC int -xlog_recover_process_rui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_rui_log_item *ruip; - int error; - - /* - * Skip RUIs that we've already processed. - */ - ruip = container_of(lip, struct xfs_rui_log_item, rui_item); - if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_rui_recover(mp, ruip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the RUI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_rui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_rui_log_item *ruip; - - ruip = container_of(lip, struct xfs_rui_log_item, rui_item); - - spin_unlock(&ailp->ail_lock); - xfs_rui_release(ruip); - spin_lock(&ailp->ail_lock); -} - -/* Recover the CUI if necessary. */ -STATIC int -xlog_recover_process_cui( - struct xfs_trans *parent_tp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_cui_log_item *cuip; - int error; - - /* - * Skip CUIs that we've already processed. - */ - cuip = container_of(lip, struct xfs_cui_log_item, cui_item); - if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_cui_recover(parent_tp, cuip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the CUI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_cui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_cui_log_item *cuip; - - cuip = container_of(lip, struct xfs_cui_log_item, cui_item); - - spin_unlock(&ailp->ail_lock); - xfs_cui_release(cuip); - spin_lock(&ailp->ail_lock); -} - -/* Recover the BUI if necessary. */ -STATIC int -xlog_recover_process_bui( - struct xfs_trans *parent_tp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_bui_log_item *buip; - int error; - - /* - * Skip BUIs that we've already processed. - */ - buip = container_of(lip, struct xfs_bui_log_item, bui_item); - if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)) - return 0; - - spin_unlock(&ailp->ail_lock); - error = xfs_bui_recover(parent_tp, buip); - spin_lock(&ailp->ail_lock); - - return error; -} - -/* Release the BUI since we're cancelling everything. */ -STATIC void -xlog_recover_cancel_bui( - struct xfs_mount *mp, - struct xfs_ail *ailp, - struct xfs_log_item *lip) -{ - struct xfs_bui_log_item *buip; - - buip = container_of(lip, struct xfs_bui_log_item, bui_item); - - spin_unlock(&ailp->ail_lock); - xfs_bui_release(buip); - spin_lock(&ailp->ail_lock); -} - -/* Is this log item a deferred action intent? */ -static inline bool xlog_item_is_intent(struct xfs_log_item *lip) -{ - switch (lip->li_type) { - case XFS_LI_EFI: - case XFS_LI_RUI: - case XFS_LI_CUI: - case XFS_LI_BUI: - return true; - default: - return false; - } -} - /* Take all the collected deferred ops and finish them in order. */ static int xlog_finish_defer_ops( @@ -4771,6 +2502,13 @@ xlog_finish_defer_ops( return xfs_trans_commit(tp); } +/* Is this log item a deferred action intent? */ +static inline bool xlog_item_is_intent(struct xfs_log_item *lip) +{ + return lip->li_ops->iop_recover != NULL && + lip->li_ops->iop_match != NULL; +} + /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -4841,23 +2579,14 @@ xlog_recover_process_intents( /* * NOTE: If your intent processing routine can create more - * deferred ops, you /must/ attach them to the dfops in this - * routine or else those subsequent intents will get + * deferred ops, you /must/ attach them to the transaction in + * this routine or else those subsequent intents will get * replayed in the wrong order! */ - switch (lip->li_type) { - case XFS_LI_EFI: - error = xlog_recover_process_efi(log->l_mp, ailp, lip); - break; - case XFS_LI_RUI: - error = xlog_recover_process_rui(log->l_mp, ailp, lip); - break; - case XFS_LI_CUI: - error = xlog_recover_process_cui(parent_tp, ailp, lip); - break; - case XFS_LI_BUI: - error = xlog_recover_process_bui(parent_tp, ailp, lip); - break; + if (!test_and_set_bit(XFS_LI_RECOVERED, &lip->li_flags)) { + spin_unlock(&ailp->ail_lock); + error = lip->li_ops->iop_recover(lip, parent_tp); + spin_lock(&ailp->ail_lock); } if (error) goto out; @@ -4901,21 +2630,9 @@ xlog_recover_cancel_intents( break; } - switch (lip->li_type) { - case XFS_LI_EFI: - xlog_recover_cancel_efi(log->l_mp, ailp, lip); - break; - case XFS_LI_RUI: - xlog_recover_cancel_rui(log->l_mp, ailp, lip); - break; - case XFS_LI_CUI: - xlog_recover_cancel_cui(log->l_mp, ailp, lip); - break; - case XFS_LI_BUI: - xlog_recover_cancel_bui(log->l_mp, ailp, lip); - break; - } - + spin_unlock(&ailp->ail_lock); + lip->li_ops->iop_release(lip); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_next(ailp, &cur); } @@ -4987,7 +2704,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0); if (error) goto fail_iput; diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index e0f9d3b6abe9..bc66d95c8d4c 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -117,3 +117,25 @@ xfs_hex_dump(const void *p, int length) { print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1); } + +void +xfs_buf_alert_ratelimited( + struct xfs_buf *bp, + const char *rlmsg, + const char *fmt, + ...) +{ + struct xfs_mount *mp = bp->b_mount; + struct va_format vaf; + va_list args; + + /* use the more aggressive per-target rate limit for buffers */ + if (!___ratelimit(&bp->b_target->bt_ioerror_rl, rlmsg)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + __xfs_printk(KERN_ALERT, mp, &vaf); + va_end(args); +} diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 0b05e10995a0..4d9bd6bb63ca 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -31,15 +31,27 @@ void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) } #endif -#define xfs_printk_ratelimited(func, dev, fmt, ...) \ +#define xfs_printk_ratelimited(func, dev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ if (__ratelimit(&_rs)) \ - func(dev, fmt, ##__VA_ARGS__); \ + func(dev, fmt, ##__VA_ARGS__); \ } while (0) +#define xfs_printk_once(func, dev, fmt, ...) \ +({ \ + static bool __section(.data.once) __print_once; \ + bool __ret_print_once = !__print_once; \ + \ + if (!__print_once) { \ + __print_once = true; \ + func(dev, fmt, ##__VA_ARGS__); \ + } \ + unlikely(__ret_print_once); \ +}) + #define xfs_emerg_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__) #define xfs_alert_ratelimited(dev, fmt, ...) \ @@ -57,9 +69,17 @@ do { \ #define xfs_debug_ratelimited(dev, fmt, ...) \ xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_once(dev, fmt, ...) \ + xfs_printk_once(xfs_warn, dev, fmt, ##__VA_ARGS__) +#define xfs_notice_once(dev, fmt, ...) \ + xfs_printk_once(xfs_notice, dev, fmt, ##__VA_ARGS__) + void assfail(struct xfs_mount *mp, char *expr, char *f, int l); void asswarn(struct xfs_mount *mp, char *expr, char *f, int l); extern void xfs_hex_dump(const void *p, int length); +void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg, + const char *fmt, ...); + #endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c5513e5a226a..d5dcf9869860 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1190,39 +1190,6 @@ xfs_log_sbcount(xfs_mount_t *mp) } /* - * Deltas for the inode count are +/-64, hence we use a large batch size - * of 128 so we don't need to take the counter lock on every update. - */ -#define XFS_ICOUNT_BATCH 128 -int -xfs_mod_icount( - struct xfs_mount *mp, - int64_t delta) -{ - percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH); - if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { - ASSERT(0); - percpu_counter_add(&mp->m_icount, -delta); - return -EINVAL; - } - return 0; -} - -int -xfs_mod_ifree( - struct xfs_mount *mp, - int64_t delta) -{ - percpu_counter_add(&mp->m_ifree, delta); - if (percpu_counter_compare(&mp->m_ifree, 0) < 0) { - ASSERT(0); - percpu_counter_add(&mp->m_ifree, -delta); - return -EINVAL; - } - return 0; -} - -/* * Deltas for the block count can vary from 1 to very large, but lock contention * only occurs on frequent small block count updates such as in the delayed * allocation path for buffered writes (page a time updates). Hence we set @@ -1300,10 +1267,9 @@ xfs_mod_fdblocks( spin_unlock(&mp->m_sb_lock); return 0; } - printk_once(KERN_WARNING - "Filesystem \"%s\": reserve blocks depleted! " - "Consider increasing reserve pool size.", - mp->m_super->s_id); + xfs_warn_once(mp, +"Reserve blocks depleted! Consider increasing reserve pool size."); + fdblocks_enospc: spin_unlock(&mp->m_sb_lock); return -ENOSPC; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b2e4598fdf7d..3725d25ad97e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -55,61 +55,25 @@ struct xfs_error_cfg { long retry_timeout; /* in jiffies, -1 = infinite */ }; +/* + * The struct xfsmount layout is optimised to separate read-mostly variables + * from variables that are frequently modified. We put the read-mostly variables + * first, then place all the other variables at the end. + * + * Typically, read-mostly variables are those that are set at mount time and + * never changed again, or only change rarely as a result of things like sysfs + * knobs being tweaked. + */ typedef struct xfs_mount { + struct xfs_sb m_sb; /* copy of fs superblock */ struct super_block *m_super; - - /* - * Bitsets of per-fs metadata that have been checked and/or are sick. - * Callers must hold m_sb_lock to access these two fields. - */ - uint8_t m_fs_checked; - uint8_t m_fs_sick; - /* - * Bitsets of rt metadata that have been checked and/or are sick. - * Callers must hold m_sb_lock to access this field. - */ - uint8_t m_rt_checked; - uint8_t m_rt_sick; - struct xfs_ail *m_ail; /* fs active log item list */ - - struct xfs_sb m_sb; /* copy of fs superblock */ - spinlock_t m_sb_lock; /* sb counter lock */ - struct percpu_counter m_icount; /* allocated inodes counter */ - struct percpu_counter m_ifree; /* free inodes counter */ - struct percpu_counter m_fdblocks; /* free block counter */ - /* - * Count of data device blocks reserved for delayed allocations, - * including indlen blocks. Does not include allocated CoW staging - * extents or anything related to the rt device. - */ - struct percpu_counter m_delalloc_blks; - struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_rtname; /* realtime device name */ char *m_logname; /* external log device name */ - int m_bsize; /* fs logical block size */ - xfs_agnumber_t m_agfrotor; /* last ag where space found */ - xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ - spinlock_t m_agirotor_lock;/* .. and lock protecting it */ - xfs_agnumber_t m_maxagi; /* highest inode alloc group */ - uint m_allocsize_log;/* min write size log bytes */ - uint m_allocsize_blocks; /* min write size blocks */ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ - struct xfs_ino_geometry m_ino_geo; /* inode geometry */ - int m_logbufs; /* number of log buffers */ - int m_logbsize; /* size of each log buffer */ - uint m_rsumlevels; /* rt summary levels */ - uint m_rsumsize; /* size of rt summary, bytes */ - /* - * Optional cache of rt summary level per bitmap block with the - * invariant that m_rsum_cache[bbno] <= the minimum i for which - * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip - * inode lock. - */ - uint8_t *m_rsum_cache; struct xfs_inode *m_rbmip; /* pointer to bitmap inode */ struct xfs_inode *m_rsumip; /* pointer to summary inode */ struct xfs_inode *m_rootip; /* pointer to root directory */ @@ -117,9 +81,26 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ + /* + * Optional cache of rt summary level per bitmap block with the + * invariant that m_rsum_cache[bbno] <= the minimum i for which + * rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip + * inode lock. + */ + uint8_t *m_rsum_cache; + struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ + struct workqueue_struct *m_buf_workqueue; + struct workqueue_struct *m_unwritten_workqueue; + struct workqueue_struct *m_cil_workqueue; + struct workqueue_struct *m_reclaim_workqueue; + struct workqueue_struct *m_eofblocks_workqueue; + struct workqueue_struct *m_sync_workqueue; + + int m_bsize; /* fs logical block size */ uint8_t m_blkbit_log; /* blocklog + NBBY */ uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ uint8_t m_agno_log; /* log #ag's */ + uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ @@ -138,47 +119,82 @@ typedef struct xfs_mount { xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ - struct radix_tree_root m_perag_tree; /* per-ag accounting info */ - spinlock_t m_perag_lock; /* lock for m_perag_tree */ - struct mutex m_growlock; /* growfs mutex */ + int m_dalign; /* stripe unit */ + int m_swidth; /* stripe width */ + xfs_agnumber_t m_maxagi; /* highest inode alloc group */ + uint m_allocsize_log;/* min write size log bytes */ + uint m_allocsize_blocks; /* min write size blocks */ + int m_logbufs; /* number of log buffers */ + int m_logbsize; /* size of each log buffer */ + uint m_rsumlevels; /* rt summary levels */ + uint m_rsumsize; /* size of rt summary, bytes */ int m_fixedfsid[2]; /* unchanged for life of FS */ - uint64_t m_flags; /* global mount flags */ - bool m_finobt_nores; /* no per-AG finobt resv. */ uint m_qflags; /* quota status flags */ + uint64_t m_flags; /* global mount flags */ + int64_t m_low_space[XFS_LOWSP_MAX]; + struct xfs_ino_geometry m_ino_geo; /* inode geometry */ struct xfs_trans_resv m_resv; /* precomputed res values */ + /* low free space thresholds */ + bool m_always_cow; + bool m_fail_unmount; + bool m_finobt_nores; /* no per-AG finobt resv. */ + bool m_update_sb; /* sb needs update in mount */ + + /* + * Bitsets of per-fs metadata that have been checked and/or are sick. + * Callers must hold m_sb_lock to access these two fields. + */ + uint8_t m_fs_checked; + uint8_t m_fs_sick; + /* + * Bitsets of rt metadata that have been checked and/or are sick. + * Callers must hold m_sb_lock to access this field. + */ + uint8_t m_rt_checked; + uint8_t m_rt_sick; + + /* + * End of read-mostly variables. Frequently written variables and locks + * should be placed below this comment from now on. The first variable + * here is marked as cacheline aligned so they it is separated from + * the read-mostly variables. + */ + + spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ + struct percpu_counter m_icount; /* allocated inodes counter */ + struct percpu_counter m_ifree; /* free inodes counter */ + struct percpu_counter m_fdblocks; /* free block counter */ + /* + * Count of data device blocks reserved for delayed allocations, + * including indlen blocks. Does not include allocated CoW staging + * extents or anything related to the rt device. + */ + struct percpu_counter m_delalloc_blks; + + struct radix_tree_root m_perag_tree; /* per-ag accounting info */ + spinlock_t m_perag_lock; /* lock for m_perag_tree */ uint64_t m_resblks; /* total reserved blocks */ uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ - int m_dalign; /* stripe unit */ - int m_swidth; /* stripe width */ - uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ - atomic_t m_active_trans; /* number trans frozen */ - struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks trimming */ struct delayed_work m_cowblocks_work; /* background cow blocks trimming */ - bool m_update_sb; /* sb needs update in mount */ - int64_t m_low_space[XFS_LOWSP_MAX]; - /* low free space thresholds */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; struct xfs_kobj m_error_meta_kobj; struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ + xfs_agnumber_t m_agfrotor; /* last ag where space found */ + xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ + spinlock_t m_agirotor_lock;/* .. and lock protecting it */ /* * Workqueue item so that we can coalesce multiple inode flush attempts * into a single flush. */ struct work_struct m_flush_inodes_work; - struct workqueue_struct *m_buf_workqueue; - struct workqueue_struct *m_unwritten_workqueue; - struct workqueue_struct *m_cil_workqueue; - struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_eofblocks_workqueue; - struct workqueue_struct *m_sync_workqueue; /* * Generation of the filesysyem layout. This is incremented by each @@ -190,9 +206,8 @@ typedef struct xfs_mount { * to various other kinds of pain inflicted on the pNFS server. */ uint32_t m_generation; + struct mutex m_growlock; /* growfs mutex */ - bool m_always_cow; - bool m_fail_unmount; #ifdef DEBUG /* * Frequency with which errors are injected. Replaces xfs_etest; the @@ -237,8 +252,8 @@ typedef struct xfs_mount { #define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams allocator */ #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ - -#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */ +#define XFS_MOUNT_DAX_ALWAYS (1ULL << 26) +#define XFS_MOUNT_DAX_NEVER (1ULL << 27) /* * Max and min values for mount-option defined I/O @@ -259,8 +274,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, #define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ #define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ -#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ -#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ /* * Flags for xfs_mountfs @@ -394,8 +407,6 @@ extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta); -extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta); extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index bb3008d390aa..b101feb2aab4 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -58,9 +58,8 @@ xfs_fs_get_uuid( { struct xfs_mount *mp = XFS_M(sb); - printk_once(KERN_NOTICE -"XFS (%s): using experimental pNFS feature, use at your own risk!\n", - mp->m_super->s_id); + xfs_notice_once(mp, +"Using experimental pNFS feature, use at your own risk!"); if (*len < sizeof(uuid_t)) return -EINVAL; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index c225691fad15..d6cd83317344 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -558,7 +558,7 @@ xfs_qm_set_defquota( return; ddqp = &dqp->q_core; - defq = xfs_get_defquota(dqp, qinf); + defq = xfs_get_defquota(qinf, xfs_dquot_type(dqp)); /* * Timers and warnings have been already set, let's just set the @@ -577,19 +577,22 @@ xfs_qm_set_defquota( static void xfs_qm_init_timelimits( struct xfs_mount *mp, - struct xfs_quotainfo *qinf) + uint type) { + struct xfs_quotainfo *qinf = mp->m_quotainfo; + struct xfs_def_quota *defq; struct xfs_disk_dquot *ddqp; struct xfs_dquot *dqp; - uint type; int error; - qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; - qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; - qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; - qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; - qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; - qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + defq = xfs_get_defquota(qinf, type); + + defq->btimelimit = XFS_QM_BTIMELIMIT; + defq->itimelimit = XFS_QM_ITIMELIMIT; + defq->rtbtimelimit = XFS_QM_RTBTIMELIMIT; + defq->bwarnlimit = XFS_QM_BWARNLIMIT; + defq->iwarnlimit = XFS_QM_IWARNLIMIT; + defq->rtbwarnlimit = XFS_QM_RTBWARNLIMIT; /* * We try to get the limits from the superuser's limits fields. @@ -597,39 +600,30 @@ xfs_qm_init_timelimits( * * Since we may not have done a quotacheck by this point, just read * the dquot without attaching it to any hashtables or lists. - * - * Timers and warnings are globally set by the first timer found in - * user/group/proj quota types, otherwise a default value is used. - * This should be split into different fields per quota type. */ - if (XFS_IS_UQUOTA_RUNNING(mp)) - type = XFS_DQ_USER; - else if (XFS_IS_GQUOTA_RUNNING(mp)) - type = XFS_DQ_GROUP; - else - type = XFS_DQ_PROJ; error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); if (error) return; ddqp = &dqp->q_core; + /* * The warnings and timers set the grace period given to * a user or group before he or she can not perform any * more writing. If it is zero, a default is used. */ if (ddqp->d_btimer) - qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer); + defq->btimelimit = be32_to_cpu(ddqp->d_btimer); if (ddqp->d_itimer) - qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer); + defq->itimelimit = be32_to_cpu(ddqp->d_itimer); if (ddqp->d_rtbtimer) - qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); + defq->rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); if (ddqp->d_bwarns) - qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns); + defq->bwarnlimit = be16_to_cpu(ddqp->d_bwarns); if (ddqp->d_iwarns) - qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns); + defq->iwarnlimit = be16_to_cpu(ddqp->d_iwarns); if (ddqp->d_rtbwarns) - qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); + defq->rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); xfs_qm_dqdestroy(dqp); } @@ -675,7 +669,9 @@ xfs_qm_init_quotainfo( mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); - xfs_qm_init_timelimits(mp, qinf); + xfs_qm_init_timelimits(mp, XFS_DQ_USER); + xfs_qm_init_timelimits(mp, XFS_DQ_GROUP); + xfs_qm_init_timelimits(mp, XFS_DQ_PROJ); if (XFS_IS_UQUOTA_RUNNING(mp)) xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); @@ -780,7 +776,8 @@ xfs_qm_qino_alloc( } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, - XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp); + need_alloc ? XFS_QM_QINOCREATE_SPACE_RES(mp) : 0, + 0, 0, &tp); if (error) return error; @@ -1116,7 +1113,7 @@ xfs_qm_quotacheck_dqadjust( */ if (dqp->q_core.d_id) { xfs_qm_adjust_dqlimits(mp, dqp); - xfs_qm_adjust_dqtimers(mp, &dqp->q_core); + xfs_qm_adjust_dqtimers(mp, dqp); } dqp->dq_flags |= XFS_DQ_DIRTY; @@ -1730,8 +1727,7 @@ xfs_qm_vop_dqalloc( pq = xfs_qm_dqhold(ip->i_pdquot); } } - if (uq) - trace_xfs_dquot_dqalloc(ip); + trace_xfs_dquot_dqalloc(ip); xfs_iunlock(ip, lockflags); if (O_udqpp) @@ -1808,7 +1804,7 @@ xfs_qm_vop_chown_reserve( { struct xfs_mount *mp = ip->i_mount; uint64_t delblks; - unsigned int blkflags, prjflags = 0; + unsigned int blkflags; struct xfs_dquot *udq_unres = NULL; struct xfs_dquot *gdq_unres = NULL; struct xfs_dquot *pdq_unres = NULL; @@ -1849,7 +1845,6 @@ xfs_qm_vop_chown_reserve( if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) { - prjflags = XFS_QMOPT_ENOSPC; pdq_delblks = pdqp; if (delblks) { ASSERT(ip->i_pdquot); @@ -1859,8 +1854,7 @@ xfs_qm_vop_chown_reserve( error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, udq_delblks, gdq_delblks, pdq_delblks, - ip->i_d.di_nblocks, 1, - flags | blkflags | prjflags); + ip->i_d.di_nblocks, 1, flags | blkflags); if (error) return error; @@ -1878,8 +1872,7 @@ xfs_qm_vop_chown_reserve( ASSERT(udq_unres || gdq_unres || pdq_unres); error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, udq_delblks, gdq_delblks, pdq_delblks, - (xfs_qcnt_t)delblks, 0, - flags | blkflags | prjflags); + (xfs_qcnt_t)delblks, 0, flags | blkflags); if (error) return error; xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, @@ -1932,7 +1925,6 @@ xfs_qm_vop_create_dqattach( return; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 4e57edca8bce..7b0e771fcbce 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -41,13 +41,20 @@ extern struct kmem_zone *xfs_qm_dqtrxzone; */ #define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 +/* Defaults for each quota type: time limits, warn limits, usage limits */ struct xfs_def_quota { - xfs_qcnt_t bhardlimit; /* default data blk hard limit */ - xfs_qcnt_t bsoftlimit; /* default data blk soft limit */ - xfs_qcnt_t ihardlimit; /* default inode count hard limit */ - xfs_qcnt_t isoftlimit; /* default inode count soft limit */ - xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */ - xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */ + time64_t btimelimit; /* limit for blks timer */ + time64_t itimelimit; /* limit for inodes timer */ + time64_t rtbtimelimit; /* limit for rt blks timer */ + xfs_qwarncnt_t bwarnlimit; /* limit for blks warnings */ + xfs_qwarncnt_t iwarnlimit; /* limit for inodes warnings */ + xfs_qwarncnt_t rtbwarnlimit; /* limit for rt blks warnings */ + xfs_qcnt_t bhardlimit; /* default data blk hard limit */ + xfs_qcnt_t bsoftlimit; /* default data blk soft limit */ + xfs_qcnt_t ihardlimit; /* default inode count hard limit */ + xfs_qcnt_t isoftlimit; /* default inode count soft limit */ + xfs_qcnt_t rtbhardlimit; /* default realtime blk hard limit */ + xfs_qcnt_t rtbsoftlimit; /* default realtime blk soft limit */ }; /* @@ -55,28 +62,22 @@ struct xfs_def_quota { * The mount structure keeps a pointer to this. */ struct xfs_quotainfo { - struct radix_tree_root qi_uquota_tree; - struct radix_tree_root qi_gquota_tree; - struct radix_tree_root qi_pquota_tree; - struct mutex qi_tree_lock; + struct radix_tree_root qi_uquota_tree; + struct radix_tree_root qi_gquota_tree; + struct radix_tree_root qi_pquota_tree; + struct mutex qi_tree_lock; struct xfs_inode *qi_uquotaip; /* user quota inode */ struct xfs_inode *qi_gquotaip; /* group quota inode */ struct xfs_inode *qi_pquotaip; /* project quota inode */ - struct list_lru qi_lru; - int qi_dquots; - time64_t qi_btimelimit; /* limit for blks timer */ - time64_t qi_itimelimit; /* limit for inodes timer */ - time64_t qi_rtbtimelimit;/* limit for rt blks timer */ - xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ - xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ - xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ - struct mutex qi_quotaofflock;/* to serialize quotaoff */ - xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ - uint qi_dqperchunk; /* # ondisk dqs in above chunk */ + struct list_lru qi_lru; + int qi_dquots; + struct mutex qi_quotaofflock;/* to serialize quotaoff */ + xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ + uint qi_dqperchunk; /* # ondisk dq in above chunk */ struct xfs_def_quota qi_usr_default; struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; - struct shrinker qi_shrinker; + struct shrinker qi_shrinker; }; static inline struct radix_tree_root * @@ -113,6 +114,17 @@ xfs_quota_inode(xfs_mount_t *mp, uint dq_flags) return NULL; } +static inline int +xfs_dquot_type(struct xfs_dquot *dqp) +{ + if (XFS_QM_ISUDQ(dqp)) + return XFS_DQ_USER; + if (XFS_QM_ISGDQ(dqp)) + return XFS_DQ_GROUP; + ASSERT(XFS_QM_ISPDQ(dqp)); + return XFS_DQ_PROJ; +} + extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp, uint field, int64_t delta); extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); @@ -164,19 +176,19 @@ extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); static inline struct xfs_def_quota * -xfs_get_defquota(struct xfs_dquot *dqp, struct xfs_quotainfo *qi) +xfs_get_defquota(struct xfs_quotainfo *qi, int type) { - struct xfs_def_quota *defq; - - if (XFS_QM_ISUDQ(dqp)) - defq = &qi->qi_usr_default; - else if (XFS_QM_ISGDQ(dqp)) - defq = &qi->qi_grp_default; - else { - ASSERT(XFS_QM_ISPDQ(dqp)); - defq = &qi->qi_prj_default; + switch (type) { + case XFS_DQ_USER: + return &qi->qi_usr_default; + case XFS_DQ_GROUP: + return &qi->qi_grp_default; + case XFS_DQ_PROJ: + return &qi->qi_prj_default; + default: + ASSERT(0); + return NULL; } - return defq; } #endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5d5ac65aa1cc..7effd7a28136 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -302,7 +302,7 @@ xfs_qm_scall_trunc_qfile( goto out_unlock; } - ASSERT(ip->i_d.di_nextents == 0); + ASSERT(ip->i_df.if_nextents == 0); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); error = xfs_trans_commit(tp); @@ -357,11 +357,11 @@ xfs_qm_scall_quotaon( int error; uint qf; - flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); /* - * Switching on quota accounting must be done at mount time. + * Switching on quota accounting must be done at mount time, + * only consider quota enforcement stuff here. */ - flags &= ~(XFS_ALL_QUOTA_ACCT); + flags &= XFS_ALL_QUOTA_ENFD; if (flags == 0) { xfs_debug(mp, "%s: zero flags, m_qflags=%x", @@ -479,7 +479,7 @@ xfs_qm_scall_setqlim( goto out_unlock; } - defq = xfs_get_defquota(dqp, q); + defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); xfs_dqunlock(dqp); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp); @@ -555,32 +555,40 @@ xfs_qm_scall_setqlim( ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns); if (id == 0) { - /* - * Timelimits for the super user set the relative time - * the other users can be over quota for this file system. - * If it is zero a default is used. Ditto for the default - * soft and hard limit values (already done, above), and - * for warnings. - */ - if (newlim->d_fieldmask & QC_SPC_TIMER) { - q->qi_btimelimit = newlim->d_spc_timer; - ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); - } - if (newlim->d_fieldmask & QC_INO_TIMER) { - q->qi_itimelimit = newlim->d_ino_timer; - ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); - } - if (newlim->d_fieldmask & QC_RT_SPC_TIMER) { - q->qi_rtbtimelimit = newlim->d_rt_spc_timer; - ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); - } if (newlim->d_fieldmask & QC_SPC_WARNS) - q->qi_bwarnlimit = newlim->d_spc_warns; + defq->bwarnlimit = newlim->d_spc_warns; if (newlim->d_fieldmask & QC_INO_WARNS) - q->qi_iwarnlimit = newlim->d_ino_warns; + defq->iwarnlimit = newlim->d_ino_warns; if (newlim->d_fieldmask & QC_RT_SPC_WARNS) - q->qi_rtbwarnlimit = newlim->d_rt_spc_warns; - } else { + defq->rtbwarnlimit = newlim->d_rt_spc_warns; + } + + /* + * Timelimits for the super user set the relative time the other users + * can be over quota for this file system. If it is zero a default is + * used. Ditto for the default soft and hard limit values (already + * done, above), and for warnings. + * + * For other IDs, userspace can bump out the grace period if over + * the soft limit. + */ + if (newlim->d_fieldmask & QC_SPC_TIMER) + ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); + if (newlim->d_fieldmask & QC_INO_TIMER) + ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) + ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); + + if (id == 0) { + if (newlim->d_fieldmask & QC_SPC_TIMER) + defq->btimelimit = newlim->d_spc_timer; + if (newlim->d_fieldmask & QC_INO_TIMER) + defq->itimelimit = newlim->d_ino_timer; + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) + defq->rtbtimelimit = newlim->d_rt_spc_timer; + } + + if (id != 0) { /* * If the user is now over quota, start the timelimit. * The user will not be 'warned'. @@ -588,7 +596,7 @@ xfs_qm_scall_setqlim( * is on or off. We don't really want to bother with iterating * over all ondisk dquots and turning the timers on/off. */ - xfs_qm_adjust_dqtimers(mp, ddq); + xfs_qm_adjust_dqtimers(mp, dqp); } dqp->dq_flags |= XFS_DQ_DIRTY; xfs_trans_log_dquot(tp, dqp); @@ -729,9 +737,10 @@ xfs_qm_scall_getquota_next( STATIC int xfs_dqrele_inode( struct xfs_inode *ip, - int flags, void *args) { + uint *flags = args; + /* skip quota inodes */ if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || ip == ip->i_mount->m_quotainfo->qi_gquotaip || @@ -743,15 +752,15 @@ xfs_dqrele_inode( } xfs_ilock(ip, XFS_ILOCK_EXCL); - if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { + if ((*flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { xfs_qm_dqrele(ip->i_udquot); ip->i_udquot = NULL; } - if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { + if ((*flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { xfs_qm_dqrele(ip->i_gdquot); ip->i_gdquot = NULL; } - if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { + if ((*flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { xfs_qm_dqrele(ip->i_pdquot); ip->i_pdquot = NULL; } @@ -768,10 +777,10 @@ xfs_dqrele_inode( */ void xfs_qm_dqrele_all_inodes( - struct xfs_mount *mp, - uint flags) + struct xfs_mount *mp, + uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL, - XFS_AGITER_INEW_WAIT); + xfs_inode_walk(mp, XFS_INODE_WALK_INEW_WAIT, xfs_dqrele_inode, + &flags, XFS_ICI_NO_TAG); } diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 38669e827206..bf809b77a316 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -21,10 +21,10 @@ xfs_qm_fill_state( struct qc_type_state *tstate, struct xfs_mount *mp, struct xfs_inode *ip, - xfs_ino_t ino) + xfs_ino_t ino, + struct xfs_def_quota *defq) { - struct xfs_quotainfo *q = mp->m_quotainfo; - bool tempqip = false; + bool tempqip = false; tstate->ino = ino; if (!ip && ino == NULLFSINO) @@ -36,13 +36,13 @@ xfs_qm_fill_state( } tstate->flags |= QCI_SYSFILE; tstate->blocks = ip->i_d.di_nblocks; - tstate->nextents = ip->i_d.di_nextents; - tstate->spc_timelimit = (u32)q->qi_btimelimit; - tstate->ino_timelimit = (u32)q->qi_itimelimit; - tstate->rt_spc_timelimit = (u32)q->qi_rtbtimelimit; - tstate->spc_warnlimit = q->qi_bwarnlimit; - tstate->ino_warnlimit = q->qi_iwarnlimit; - tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit; + tstate->nextents = ip->i_df.if_nextents; + tstate->spc_timelimit = (u32)defq->btimelimit; + tstate->ino_timelimit = (u32)defq->itimelimit; + tstate->rt_spc_timelimit = (u32)defq->rtbtimelimit; + tstate->spc_warnlimit = defq->bwarnlimit; + tstate->ino_warnlimit = defq->iwarnlimit; + tstate->rt_spc_warnlimit = defq->rtbwarnlimit; if (tempqip) xfs_irele(ip); } @@ -77,11 +77,11 @@ xfs_fs_get_quota_state( state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED; xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip, - mp->m_sb.sb_uquotino); + mp->m_sb.sb_uquotino, &q->qi_usr_default); xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip, - mp->m_sb.sb_gquotino); + mp->m_sb.sb_gquotino, &q->qi_grp_default); xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip, - mp->m_sb.sb_pquotino); + mp->m_sb.sb_pquotino, &q->qi_prj_default); return 0; } @@ -109,8 +109,8 @@ xfs_fs_set_info( int type, struct qc_info *info) { - struct xfs_mount *mp = XFS_M(sb); - struct qc_dqblk newlim; + struct xfs_mount *mp = XFS_M(sb); + struct qc_dqblk newlim; if (sb_rdonly(sb)) return -EROFS; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 8eeed73928cd..c81639891e29 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -18,16 +18,20 @@ #include "xfs_log.h" #include "xfs_refcount.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_cui_zone; kmem_zone_t *xfs_cud_zone; +static const struct xfs_item_ops xfs_cui_item_ops; + static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_cui_log_item, cui_item); } -void +STATIC void xfs_cui_item_free( struct xfs_cui_log_item *cuip) { @@ -44,13 +48,13 @@ xfs_cui_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the CUI. */ -void +STATIC void xfs_cui_release( struct xfs_cui_log_item *cuip) { ASSERT(atomic_read(&cuip->cui_refcount) > 0); if (atomic_dec_and_test(&cuip->cui_refcount)) { - xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); xfs_cui_item_free(cuip); } } @@ -123,17 +127,10 @@ xfs_cui_item_release( xfs_cui_release(CUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_cui_item_ops = { - .iop_size = xfs_cui_item_size, - .iop_format = xfs_cui_item_format, - .iop_unpin = xfs_cui_item_unpin, - .iop_release = xfs_cui_item_release, -}; - /* * Allocate and initialize an cui item with the given number of extents. */ -struct xfs_cui_log_item * +STATIC struct xfs_cui_log_item * xfs_cui_init( struct xfs_mount *mp, uint nextents) @@ -284,27 +281,6 @@ xfs_refcount_update_diff_items( XFS_FSB_TO_AGNO(mp, rb->ri_startblock); } -/* Get an CUI. */ -STATIC void * -xfs_refcount_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_cui_log_item *cuip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - cuip = xfs_cui_init(tp->t_mountp, count); - ASSERT(cuip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &cuip->cui_item); - return cuip; -} - /* Set the phys extent flags for this reverse mapping. */ static void xfs_trans_set_refcount_flags( @@ -328,16 +304,12 @@ xfs_trans_set_refcount_flags( STATIC void xfs_refcount_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_cui_log_item *cuip, + struct xfs_refcount_intent *refc) { - struct xfs_cui_log_item *cuip = intent; - struct xfs_refcount_intent *refc; uint next_extent; struct xfs_phys_extent *ext; - refc = container_of(item, struct xfs_refcount_intent, ri_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); @@ -354,23 +326,44 @@ xfs_refcount_update_log_item( xfs_trans_set_refcount_flags(ext, refc->ri_type); } +static struct xfs_log_item * +xfs_refcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); + struct xfs_refcount_intent *refc; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &cuip->cui_item); + if (sort) + list_sort(mp, items, xfs_refcount_update_diff_items); + list_for_each_entry(refc, items, ri_list) + xfs_refcount_update_log_item(tp, cuip, refc); + return &cuip->cui_item; +} + /* Get an CUD so we can process all the deferred refcount updates. */ -STATIC void * +static struct xfs_log_item * xfs_refcount_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_cud(tp, intent); + return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; } /* Process a deferred refcount update. */ STATIC int xfs_refcount_update_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_refcount_intent *refc; xfs_fsblock_t new_fsb; @@ -378,12 +371,10 @@ xfs_refcount_update_finish_item( int error; refc = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, done_item, - refc->ri_type, - refc->ri_startblock, - refc->ri_blockcount, - &new_fsb, &new_aglen, - (struct xfs_btree_cur **)state); + error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), + refc->ri_type, refc->ri_startblock, refc->ri_blockcount, + &new_fsb, &new_aglen, state); + /* Did we run out of reservation? Requeue what we didn't finish. */ if (!error && new_aglen > 0) { ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || @@ -396,24 +387,12 @@ xfs_refcount_update_finish_item( return error; } -/* Clean up after processing deferred refcounts. */ -STATIC void -xfs_refcount_update_finish_cleanup( - struct xfs_trans *tp, - void *state, - int error) -{ - struct xfs_btree_cur *rcur = state; - - xfs_refcount_finish_one_cleanup(tp, rcur, error); -} - /* Abort all pending CUIs. */ STATIC void xfs_refcount_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_cui_release(intent); + xfs_cui_release(CUI_ITEM(intent)); } /* Cancel a deferred refcount update. */ @@ -429,13 +408,11 @@ xfs_refcount_update_cancel_item( const struct xfs_defer_op_type xfs_refcount_update_defer_type = { .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .diff_items = xfs_refcount_update_diff_items, .create_intent = xfs_refcount_update_create_intent, .abort_intent = xfs_refcount_update_abort_intent, - .log_item = xfs_refcount_update_log_item, .create_done = xfs_refcount_update_create_done, .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_update_finish_cleanup, + .finish_cleanup = xfs_refcount_finish_one_cleanup, .cancel_item = xfs_refcount_update_cancel_item, }; @@ -443,28 +420,27 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { * Process a refcount update intent item that was recovered from the log. * We need to update the refcountbt. */ -int -xfs_cui_recover( - struct xfs_trans *parent_tp, - struct xfs_cui_log_item *cuip) +STATIC int +xfs_cui_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - int i; - int error = 0; - unsigned int refc_type; + struct xfs_bmbt_irec irec; + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_phys_extent *refc; - xfs_fsblock_t startblock_fsb; - bool op_ok; struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - enum xfs_refcount_intent_type type; + struct xfs_mount *mp = parent_tp->t_mountp; + xfs_fsblock_t startblock_fsb; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; - struct xfs_bmbt_irec irec; + unsigned int refc_type; + bool op_ok; bool requeue_only = false; - struct xfs_mount *mp = parent_tp->t_mountp; - - ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); + enum xfs_refcount_intent_type type; + int i; + int error = 0; /* * First check the validity of the extents described by the @@ -495,7 +471,6 @@ xfs_cui_recover( * This will pull the CUI from the AIL and * free the memory associated with it. */ - set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); xfs_cui_release(cuip); return -EFSCORRUPTED; } @@ -579,7 +554,6 @@ xfs_cui_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); xfs_defer_move(parent_tp, tp); error = xfs_trans_commit(tp); return error; @@ -590,3 +564,117 @@ abort_error: xfs_trans_cancel(tp); return error; } + +STATIC bool +xfs_cui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return CUI_ITEM(lip)->cui_format.cui_id == intent_id; +} + +static const struct xfs_item_ops xfs_cui_item_ops = { + .iop_size = xfs_cui_item_size, + .iop_format = xfs_cui_item_format, + .iop_unpin = xfs_cui_item_unpin, + .iop_release = xfs_cui_item_release, + .iop_recover = xfs_cui_item_recover, + .iop_match = xfs_cui_item_match, +}; + +/* + * Copy an CUI format buffer from the given buf, and into the destination + * CUI format structure. The CUI/CUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_cui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_cui_log_format *dst_cui_fmt) +{ + struct xfs_cui_log_format *src_cui_fmt; + uint len; + + src_cui_fmt = buf->i_addr; + len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + + if (buf->i_len == len) { + memcpy(dst_cui_fmt, src_cui_fmt, len); + return 0; + } + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent refcount update + * item from the cui format structure which was logged on disk. + * It allocates an in-core cui, copies the extents from the format + * structure into it, and adds the cui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_cui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_cui_log_item *cuip; + struct xfs_cui_log_format *cui_formatp; + + cui_formatp = item->ri_buf[0].i_addr; + + cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); + if (error) { + xfs_cui_item_free(cuip); + return error; + } + atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn); + xfs_cui_release(cuip); + return 0; +} + +const struct xlog_recover_item_ops xlog_cui_item_ops = { + .item_type = XFS_LI_CUI, + .commit_pass2 = xlog_recover_cui_commit_pass2, +}; + +/* + * This routine is called when an CUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding CUI if it + * was still in the log. To do this it searches the AIL for the CUI with an id + * equal to that in the CUD format structure. If we find it we drop the CUD + * reference, which removes the CUI from the AIL and frees it. + */ +STATIC int +xlog_recover_cud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_cud_log_format *cud_formatp; + + cud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); + return -EFSCORRUPTED; + } + + xlog_recover_release_intent(log, XFS_LI_CUI, cud_formatp->cud_cui_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_cud_item_ops = { + .item_type = XFS_LI_CUD, + .commit_pass2 = xlog_recover_cud_commit_pass2, +}; diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index e47530f30489..f4f2e836540b 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -33,11 +33,6 @@ struct kmem_zone; #define XFS_CUI_MAX_FAST_EXTENTS 16 /* - * Define CUI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_CUI_RECOVERED 1 - -/* * This is the "refcount update intent" log item. It is used to log * the fact that some reverse mappings need to change. It is used in * conjunction with the "refcount update done" log item described @@ -51,7 +46,6 @@ struct xfs_cui_log_item { struct xfs_log_item cui_item; atomic_t cui_refcount; atomic_t cui_next_extent; - unsigned long cui_flags; /* misc flags */ struct xfs_cui_log_format cui_format; }; @@ -77,9 +71,4 @@ struct xfs_cud_log_item { extern struct kmem_zone *xfs_cui_zone; extern struct kmem_zone *xfs_cud_zone; -struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); -void xfs_cui_item_free(struct xfs_cui_log_item *); -void xfs_cui_release(struct xfs_cui_log_item *); -int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip); - #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 4911b68f95dd..a86599db20a6 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -18,16 +18,20 @@ #include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_error.h" +#include "xfs_log_priv.h" +#include "xfs_log_recover.h" kmem_zone_t *xfs_rui_zone; kmem_zone_t *xfs_rud_zone; +static const struct xfs_item_ops xfs_rui_item_ops; + static inline struct xfs_rui_log_item *RUI_ITEM(struct xfs_log_item *lip) { return container_of(lip, struct xfs_rui_log_item, rui_item); } -void +STATIC void xfs_rui_item_free( struct xfs_rui_log_item *ruip) { @@ -44,13 +48,13 @@ xfs_rui_item_free( * committed vs unpin operations in bulk insert operations. Hence the reference * count to ensure only the last caller frees the RUI. */ -void +STATIC void xfs_rui_release( struct xfs_rui_log_item *ruip) { ASSERT(atomic_read(&ruip->rui_refcount) > 0); if (atomic_dec_and_test(&ruip->rui_refcount)) { - xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); xfs_rui_item_free(ruip); } } @@ -122,17 +126,10 @@ xfs_rui_item_release( xfs_rui_release(RUI_ITEM(lip)); } -static const struct xfs_item_ops xfs_rui_item_ops = { - .iop_size = xfs_rui_item_size, - .iop_format = xfs_rui_item_format, - .iop_unpin = xfs_rui_item_unpin, - .iop_release = xfs_rui_item_release, -}; - /* * Allocate and initialize an rui item with the given number of extents. */ -struct xfs_rui_log_item * +STATIC struct xfs_rui_log_item * xfs_rui_init( struct xfs_mount *mp, uint nextents) @@ -160,7 +157,7 @@ xfs_rui_init( * RUI format structure. The RUI/RUD items were designed not to need any * special alignment handling. */ -int +STATIC int xfs_rui_copy_format( struct xfs_log_iovec *buf, struct xfs_rui_log_format *dst_rui_fmt) @@ -352,41 +349,16 @@ xfs_rmap_update_diff_items( XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); } -/* Get an RUI. */ -STATIC void * -xfs_rmap_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_rui_log_item *ruip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - ruip = xfs_rui_init(tp->t_mountp, count); - ASSERT(ruip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &ruip->rui_item); - return ruip; -} - /* Log rmap updates in the intent item. */ STATIC void xfs_rmap_update_log_item( struct xfs_trans *tp, - void *intent, - struct list_head *item) + struct xfs_rui_log_item *ruip, + struct xfs_rmap_intent *rmap) { - struct xfs_rui_log_item *ruip = intent; - struct xfs_rmap_intent *rmap; uint next_extent; struct xfs_map_extent *map; - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); @@ -406,58 +378,64 @@ xfs_rmap_update_log_item( rmap->ri_bmap.br_state); } +static struct xfs_log_item * +xfs_rmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); + struct xfs_rmap_intent *rmap; + + ASSERT(count > 0); + + xfs_trans_add_item(tp, &ruip->rui_item); + if (sort) + list_sort(mp, items, xfs_rmap_update_diff_items); + list_for_each_entry(rmap, items, ri_list) + xfs_rmap_update_log_item(tp, ruip, rmap); + return &ruip->rui_item; +} + /* Get an RUD so we can process all the deferred rmap updates. */ -STATIC void * +static struct xfs_log_item * xfs_rmap_update_create_done( struct xfs_trans *tp, - void *intent, + struct xfs_log_item *intent, unsigned int count) { - return xfs_trans_get_rud(tp, intent); + return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; } /* Process a deferred rmap update. */ STATIC int xfs_rmap_update_finish_item( struct xfs_trans *tp, + struct xfs_log_item *done, struct list_head *item, - void *done_item, - void **state) + struct xfs_btree_cur **state) { struct xfs_rmap_intent *rmap; int error; rmap = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, done_item, - rmap->ri_type, - rmap->ri_owner, rmap->ri_whichfork, - rmap->ri_bmap.br_startoff, - rmap->ri_bmap.br_startblock, - rmap->ri_bmap.br_blockcount, - rmap->ri_bmap.br_state, - (struct xfs_btree_cur **)state); + error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), + rmap->ri_type, rmap->ri_owner, rmap->ri_whichfork, + rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock, + rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state, + state); kmem_free(rmap); return error; } -/* Clean up after processing deferred rmaps. */ -STATIC void -xfs_rmap_update_finish_cleanup( - struct xfs_trans *tp, - void *state, - int error) -{ - struct xfs_btree_cur *rcur = state; - - xfs_rmap_finish_one_cleanup(tp, rcur, error); -} - /* Abort all pending RUIs. */ STATIC void xfs_rmap_update_abort_intent( - void *intent) + struct xfs_log_item *intent) { - xfs_rui_release(intent); + xfs_rui_release(RUI_ITEM(intent)); } /* Cancel a deferred rmap update. */ @@ -473,13 +451,11 @@ xfs_rmap_update_cancel_item( const struct xfs_defer_op_type xfs_rmap_update_defer_type = { .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .diff_items = xfs_rmap_update_diff_items, .create_intent = xfs_rmap_update_create_intent, .abort_intent = xfs_rmap_update_abort_intent, - .log_item = xfs_rmap_update_log_item, .create_done = xfs_rmap_update_create_done, .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_update_finish_cleanup, + .finish_cleanup = xfs_rmap_finish_one_cleanup, .cancel_item = xfs_rmap_update_cancel_item, }; @@ -487,24 +463,24 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = { * Process an rmap update intent item that was recovered from the log. * We need to update the rmapbt. */ -int -xfs_rui_recover( - struct xfs_mount *mp, - struct xfs_rui_log_item *ruip) +STATIC int +xfs_rui_item_recover( + struct xfs_log_item *lip, + struct xfs_trans *parent_tp) { - int i; - int error = 0; + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_map_extent *rmap; - xfs_fsblock_t startblock_fsb; - bool op_ok; struct xfs_rud_log_item *rudp; - enum xfs_rmap_intent_type type; - int whichfork; - xfs_exntst_t state; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - - ASSERT(!test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)); + struct xfs_mount *mp = parent_tp->t_mountp; + xfs_fsblock_t startblock_fsb; + enum xfs_rmap_intent_type type; + xfs_exntst_t state; + bool op_ok; + int i; + int whichfork; + int error = 0; /* * First check the validity of the extents described by the @@ -539,7 +515,6 @@ xfs_rui_recover( * This will pull the RUI from the AIL and * free the memory associated with it. */ - set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); xfs_rui_release(ruip); return -EFSCORRUPTED; } @@ -597,7 +572,6 @@ xfs_rui_recover( } xfs_rmap_finish_one_cleanup(tp, rcur, error); - set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); error = xfs_trans_commit(tp); return error; @@ -606,3 +580,90 @@ abort_error: xfs_trans_cancel(tp); return error; } + +STATIC bool +xfs_rui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return RUI_ITEM(lip)->rui_format.rui_id == intent_id; +} + +static const struct xfs_item_ops xfs_rui_item_ops = { + .iop_size = xfs_rui_item_size, + .iop_format = xfs_rui_item_format, + .iop_unpin = xfs_rui_item_unpin, + .iop_release = xfs_rui_item_release, + .iop_recover = xfs_rui_item_recover, + .iop_match = xfs_rui_item_match, +}; + +/* + * This routine is called to create an in-core extent rmap update + * item from the rui format structure which was logged on disk. + * It allocates an in-core rui, copies the extents from the format + * structure into it, and adds the rui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_rui_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_rui_log_item *ruip; + struct xfs_rui_log_format *rui_formatp; + + rui_formatp = item->ri_buf[0].i_addr; + + ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); + error = xfs_rui_copy_format(&item->ri_buf[0], &ruip->rui_format); + if (error) { + xfs_rui_item_free(ruip); + return error; + } + atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. + */ + xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn); + xfs_rui_release(ruip); + return 0; +} + +const struct xlog_recover_item_ops xlog_rui_item_ops = { + .item_type = XFS_LI_RUI, + .commit_pass2 = xlog_recover_rui_commit_pass2, +}; + +/* + * This routine is called when an RUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding RUI if it + * was still in the log. To do this it searches the AIL for the RUI with an id + * equal to that in the RUD format structure. If we find it we drop the RUD + * reference, which removes the RUI from the AIL and frees it. + */ +STATIC int +xlog_recover_rud_commit_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + struct xfs_rud_log_format *rud_formatp; + + rud_formatp = item->ri_buf[0].i_addr; + ASSERT(item->ri_buf[0].i_len == sizeof(struct xfs_rud_log_format)); + + xlog_recover_release_intent(log, XFS_LI_RUI, rud_formatp->rud_rui_id); + return 0; +} + +const struct xlog_recover_item_ops xlog_rud_item_ops = { + .item_type = XFS_LI_RUD, + .commit_pass2 = xlog_recover_rud_commit_pass2, +}; diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 8708e4a5aa5c..31e6cdfff71f 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -36,11 +36,6 @@ struct kmem_zone; #define XFS_RUI_MAX_FAST_EXTENTS 16 /* - * Define RUI flag bits. Manipulated by set/clear/test_bit operators. - */ -#define XFS_RUI_RECOVERED 1 - -/* * This is the "rmap update intent" log item. It is used to log the fact that * some reverse mappings need to change. It is used in conjunction with the * "rmap update done" log item described below. @@ -52,7 +47,6 @@ struct xfs_rui_log_item { struct xfs_log_item rui_item; atomic_t rui_refcount; atomic_t rui_next_extent; - unsigned long rui_flags; /* misc flags */ struct xfs_rui_log_format rui_format; }; @@ -77,11 +71,4 @@ struct xfs_rud_log_item { extern struct kmem_zone *xfs_rui_zone; extern struct kmem_zone *xfs_rud_zone; -struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint); -int xfs_rui_copy_format(struct xfs_log_iovec *buf, - struct xfs_rui_log_format *dst_rui_fmt); -void xfs_rui_item_free(struct xfs_rui_log_item *); -void xfs_rui_release(struct xfs_rui_log_item *); -int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip); - #endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 424bb9a2d532..379cbff438bc 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -47,6 +47,39 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif +enum xfs_dax_mode { + XFS_DAX_INODE = 0, + XFS_DAX_ALWAYS = 1, + XFS_DAX_NEVER = 2, +}; + +static void +xfs_mount_set_dax_mode( + struct xfs_mount *mp, + enum xfs_dax_mode mode) +{ + switch (mode) { + case XFS_DAX_INODE: + mp->m_flags &= ~(XFS_MOUNT_DAX_ALWAYS | XFS_MOUNT_DAX_NEVER); + break; + case XFS_DAX_ALWAYS: + mp->m_flags |= XFS_MOUNT_DAX_ALWAYS; + mp->m_flags &= ~XFS_MOUNT_DAX_NEVER; + break; + case XFS_DAX_NEVER: + mp->m_flags |= XFS_MOUNT_DAX_NEVER; + mp->m_flags &= ~XFS_MOUNT_DAX_ALWAYS; + break; + } +} + +static const struct constant_table dax_param_enums[] = { + {"inode", XFS_DAX_INODE }, + {"always", XFS_DAX_ALWAYS }, + {"never", XFS_DAX_NEVER }, + {} +}; + /* * Table driven mount option parser. */ @@ -59,7 +92,7 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -103,6 +136,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("discard", Opt_discard), fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), + fsparam_enum("dax", Opt_dax_enum, dax_param_enums), {} }; @@ -129,7 +163,8 @@ xfs_fs_show_options( { XFS_MOUNT_GRPID, ",grpid" }, { XFS_MOUNT_DISCARD, ",discard" }, { XFS_MOUNT_LARGEIO, ",largeio" }, - { XFS_MOUNT_DAX, ",dax" }, + { XFS_MOUNT_DAX_ALWAYS, ",dax=always" }, + { XFS_MOUNT_DAX_NEVER, ",dax=never" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); @@ -305,7 +340,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); + blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS); } STATIC void @@ -702,7 +737,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); + return generic_drop_inode(inode); } static void @@ -772,7 +807,8 @@ xfs_fs_statfs( statp->f_blocks = sbp->sb_dblocks - lsize; spin_unlock(&mp->m_sb_lock); - statp->f_bfree = fdblocks - mp->m_alloc_set_aside; + /* make sure statp->f_bfree does not underflow */ + statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0); statp->f_bavail = statp->f_bfree; fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); @@ -838,8 +874,10 @@ xfs_restore_resvblks(struct xfs_mount *mp) * there is no log replay required to write the inodes to disk - this is the * primary difference between a sync and a quiesce. * - * Note: xfs_log_quiesce() stops background log work - the callers must ensure - * it is started again when appropriate. + * We cancel log work early here to ensure all transactions the log worker may + * run have finished before we clean up and log the superblock and write an + * unmount record. The unfreeze process is responsible for restarting the log + * worker correctly. */ void xfs_quiesce_attr( @@ -847,9 +885,7 @@ xfs_quiesce_attr( { int error = 0; - /* wait for all modifications to complete */ - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); + cancel_delayed_work_sync(&mp->m_log->l_work); /* force the log to unpin objects from the now complete transactions */ xfs_log_force(mp, XFS_LOG_SYNC); @@ -863,12 +899,6 @@ xfs_quiesce_attr( if (error) xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); - /* - * Just warn here till VFS can correctly support - * read-only remount without racing. - */ - WARN_ON(atomic_read(&mp->m_active_trans) != 0); - xfs_log_quiesce(mp); } @@ -1261,7 +1291,10 @@ xfs_fc_parse_param( return 0; #ifdef CONFIG_FS_DAX case Opt_dax: - mp->m_flags |= XFS_MOUNT_DAX; + xfs_mount_set_dax_mode(mp, XFS_DAX_ALWAYS); + return 0; + case Opt_dax_enum: + xfs_mount_set_dax_mode(mp, result.uint_32); return 0; #endif default: @@ -1454,7 +1487,7 @@ xfs_fc_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= SB_I_VERSION; - if (mp->m_flags & XFS_MOUNT_DAX) { + if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) { bool rtdev_is_dax = false, datadev_is_dax; xfs_warn(mp, @@ -1468,7 +1501,7 @@ xfs_fc_fill_super( if (!rtdev_is_dax && !datadev_is_dax) { xfs_alert(mp, "DAX unsupported by block device. Turning off DAX."); - mp->m_flags &= ~XFS_MOUNT_DAX; + xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); } if (xfs_sb_version_hasreflink(&mp->m_sb)) { xfs_alert(mp, @@ -1754,7 +1787,6 @@ static int xfs_init_fs_context( INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); spin_lock_init(&mp->m_perag_lock); mutex_init(&mp->m_growlock); - atomic_set(&mp->m_active_trans, 0); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 13fb4b919648..8e88a7ca387e 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -243,8 +243,7 @@ xfs_symlink( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - if (resblks) - resblks -= XFS_IALLOC_SPACE_RES(mp); + resblks -= XFS_IALLOC_SPACE_RES(mp); /* * If the symlink will fit into the inode, write it inline. */ @@ -252,7 +251,7 @@ xfs_symlink( xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); ip->i_d.di_size = pathlen; - ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ip->i_df.if_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); } else { int offset; @@ -265,8 +264,7 @@ xfs_symlink( if (error) goto out_trans_cancel; - if (resblks) - resblks -= fs_blocks; + resblks -= fs_blocks; ip->i_d.di_size = pathlen; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -386,7 +384,7 @@ xfs_inactive_symlink_rmt( * either 1 or 2 extents and that we can * free them all in one bunmapi call. */ - ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + ASSERT(ip->i_df.if_nextents > 0 && ip->i_df.if_nextents <= 2); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index a4323a63438d..460136628a79 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1897,8 +1897,8 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->which = which; __entry->ino = ip->i_ino; - __entry->format = ip->i_d.di_format; - __entry->nex = ip->i_d.di_nextents; + __entry->format = ip->i_df.if_format; + __entry->nex = ip->i_df.if_nextents; __entry->broot_size = ip->i_df.if_broot_bytes; __entry->fork_off = XFS_IFORK_BOFF(ip); ), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 28b983ff8b11..3c94e5ff4316 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -68,7 +68,6 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); trace_xfs_trans_free(tp, _RET_IP_); - atomic_dec(&tp->t_mountp->m_active_trans); if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); @@ -125,8 +124,6 @@ xfs_trans_dup( xfs_defer_move(ntp, tp); xfs_trans_dup_dqinfo(tp, ntp); - - atomic_inc(&tp->t_mountp->m_active_trans); return ntp; } @@ -275,7 +272,6 @@ xfs_trans_alloc( */ WARN_ON(resp->tr_logres > 0 && mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); - atomic_inc(&mp->m_active_trans); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; @@ -299,20 +295,19 @@ xfs_trans_alloc( /* * Create an empty transaction with no reservation. This is a defensive - * mechanism for routines that query metadata without actually modifying - * them -- if the metadata being queried is somehow cross-linked (think a - * btree block pointer that points higher in the tree), we risk deadlock. - * However, blocks grabbed as part of a transaction can be re-grabbed. - * The verifiers will notice the corrupt block and the operation will fail - * back to userspace without deadlocking. + * mechanism for routines that query metadata without actually modifying them -- + * if the metadata being queried is somehow cross-linked (think a btree block + * pointer that points higher in the tree), we risk deadlock. However, blocks + * grabbed as part of a transaction can be re-grabbed. The verifiers will + * notice the corrupt block and the operation will fail back to userspace + * without deadlocking. * - * Note the zero-length reservation; this transaction MUST be cancelled - * without any dirty data. + * Note the zero-length reservation; this transaction MUST be cancelled without + * any dirty data. * - * Callers should obtain freeze protection to avoid two conflicts with fs - * freezing: (1) having active transactions trip the m_active_trans ASSERTs; - * and (2) grabbing buffers at the same time that freeze is trying to drain - * the buffer LRU list. + * Callers should obtain freeze protection to avoid a conflict with fs freezing + * where we can be grabbing buffers at the same time that freeze is trying to + * drain the buffer LRU list. */ int xfs_trans_alloc_empty( @@ -534,57 +529,9 @@ xfs_trans_apply_sb_deltas( sizeof(sbp->sb_frextents) - 1); } -STATIC int -xfs_sb_mod8( - uint8_t *field, - int8_t delta) -{ - int8_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - -STATIC int -xfs_sb_mod32( - uint32_t *field, - int32_t delta) -{ - int32_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - -STATIC int -xfs_sb_mod64( - uint64_t *field, - int64_t delta) -{ - int64_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - /* - * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations - * and apply superblock counter changes to the in-core superblock. The + * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations and + * apply superblock counter changes to the in-core superblock. The * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT * applied to the in-core superblock. The idea is that that has already been * done. @@ -593,7 +540,12 @@ xfs_sb_mod64( * used block counts are not updated in the on disk superblock. In this case, * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we * still need to update the incore superblock with the changes. + * + * Deltas for the inode count are +/-64, hence we use a large batch size of 128 + * so we don't need to take the counter lock on every update. */ +#define XFS_ICOUNT_BATCH 128 + void xfs_trans_unreserve_and_mod_sb( struct xfs_trans *tp) @@ -629,20 +581,21 @@ xfs_trans_unreserve_and_mod_sb( /* apply the per-cpu counters */ if (blkdelta) { error = xfs_mod_fdblocks(mp, blkdelta, rsvd); - if (error) - goto out; + ASSERT(!error); } if (idelta) { - error = xfs_mod_icount(mp, idelta); - if (error) - goto out_undo_fdblocks; + percpu_counter_add_batch(&mp->m_icount, idelta, + XFS_ICOUNT_BATCH); + if (idelta < 0) + ASSERT(__percpu_counter_compare(&mp->m_icount, 0, + XFS_ICOUNT_BATCH) >= 0); } if (ifreedelta) { - error = xfs_mod_ifree(mp, ifreedelta); - if (error) - goto out_undo_icount; + percpu_counter_add(&mp->m_ifree, ifreedelta); + if (ifreedelta < 0) + ASSERT(percpu_counter_compare(&mp->m_ifree, 0) >= 0); } if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) @@ -650,95 +603,23 @@ xfs_trans_unreserve_and_mod_sb( /* apply remaining deltas */ spin_lock(&mp->m_sb_lock); - if (rtxdelta) { - error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta); - if (error) - goto out_undo_ifree; - } - - if (tp->t_dblocks_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta); - if (error) - goto out_undo_frextents; - } - if (tp->t_agcount_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta); - if (error) - goto out_undo_dblocks; - } - if (tp->t_imaxpct_delta != 0) { - error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta); - if (error) - goto out_undo_agcount; - } - if (tp->t_rextsize_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_rextsize, - tp->t_rextsize_delta); - if (error) - goto out_undo_imaxpct; - } - if (tp->t_rbmblocks_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, - tp->t_rbmblocks_delta); - if (error) - goto out_undo_rextsize; - } - if (tp->t_rblocks_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta); - if (error) - goto out_undo_rbmblocks; - } - if (tp->t_rextents_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_rextents, - tp->t_rextents_delta); - if (error) - goto out_undo_rblocks; - } - if (tp->t_rextslog_delta != 0) { - error = xfs_sb_mod8(&mp->m_sb.sb_rextslog, - tp->t_rextslog_delta); - if (error) - goto out_undo_rextents; - } + mp->m_sb.sb_frextents += rtxdelta; + mp->m_sb.sb_dblocks += tp->t_dblocks_delta; + mp->m_sb.sb_agcount += tp->t_agcount_delta; + mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; + mp->m_sb.sb_rextsize += tp->t_rextsize_delta; + mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta; + mp->m_sb.sb_rblocks += tp->t_rblocks_delta; + mp->m_sb.sb_rextents += tp->t_rextents_delta; + mp->m_sb.sb_rextslog += tp->t_rextslog_delta; spin_unlock(&mp->m_sb_lock); - return; -out_undo_rextents: - if (tp->t_rextents_delta) - xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta); -out_undo_rblocks: - if (tp->t_rblocks_delta) - xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta); -out_undo_rbmblocks: - if (tp->t_rbmblocks_delta) - xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta); -out_undo_rextsize: - if (tp->t_rextsize_delta) - xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta); -out_undo_imaxpct: - if (tp->t_rextsize_delta) - xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta); -out_undo_agcount: - if (tp->t_agcount_delta) - xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta); -out_undo_dblocks: - if (tp->t_dblocks_delta) - xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta); -out_undo_frextents: - if (rtxdelta) - xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta); -out_undo_ifree: - spin_unlock(&mp->m_sb_lock); - if (ifreedelta) - xfs_mod_ifree(mp, -ifreedelta); -out_undo_icount: - if (idelta) - xfs_mod_icount(mp, -idelta); -out_undo_fdblocks: - if (blkdelta) - xfs_mod_fdblocks(mp, -blkdelta, rsvd); -out: - ASSERT(error == 0); + /* + * Debug checks outside of the spinlock so they don't lock up the + * machine if they fail. + */ + ASSERT(mp->m_sb.sb_imax_pct >= 0); + ASSERT(mp->m_sb.sb_rextslog >= 0); return; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 752c7fef9de7..8308bf6d7e40 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -59,12 +59,14 @@ struct xfs_log_item { #define XFS_LI_ABORTED 1 #define XFS_LI_FAILED 2 #define XFS_LI_DIRTY 3 /* log item dirty in transaction */ +#define XFS_LI_RECOVERED 4 /* log intent item has been recovered */ #define XFS_LI_FLAGS \ { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ { (1 << XFS_LI_ABORTED), "ABORTED" }, \ { (1 << XFS_LI_FAILED), "FAILED" }, \ - { (1 << XFS_LI_DIRTY), "DIRTY" } + { (1 << XFS_LI_DIRTY), "DIRTY" }, \ + { (1 << XFS_LI_RECOVERED), "RECOVERED" } struct xfs_item_ops { unsigned flags; @@ -77,6 +79,8 @@ struct xfs_item_ops { void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); void (*iop_error)(struct xfs_log_item *, xfs_buf_t *); + int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp); + bool (*iop_match)(struct xfs_log_item *item, uint64_t id); }; /* diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 564253550b75..ac5019361a13 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -345,6 +345,45 @@ xfs_ail_delete( xfs_trans_ail_cursor_clear(ailp, lip); } +/* + * Requeue a failed buffer for writeback. + * + * We clear the log item failed state here as well, but we have to be careful + * about reference counts because the only active reference counts on the buffer + * may be the failed log items. Hence if we clear the log item failed state + * before queuing the buffer for IO we can release all active references to + * the buffer and free it, leading to use after free problems in + * xfs_buf_delwri_queue. It makes no difference to the buffer or log items which + * order we process them in - the buffer is locked, and we own the buffer list + * so nothing on them is going to change while we are performing this action. + * + * Hence we can safely queue the buffer for IO before we clear the failed log + * item state, therefore always having an active reference to the buffer and + * avoiding the transient zero-reference state that leads to use-after-free. + */ +static inline int +xfsaild_resubmit_item( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_buf *bp = lip->li_buf; + + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_delwri_queue(bp, buffer_list)) { + xfs_buf_unlock(bp); + return XFS_ITEM_FLUSHING; + } + + /* protected by ail_lock */ + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) + xfs_clear_li_failed(lip); + + xfs_buf_unlock(bp); + return XFS_ITEM_SUCCESS; +} + static inline uint xfsaild_push_item( struct xfs_ail *ailp, @@ -365,6 +404,8 @@ xfsaild_push_item( */ if (!lip->li_ops->iop_push) return XFS_ITEM_PINNED; + if (test_bit(XFS_LI_FAILED, &lip->li_flags)) + return xfsaild_resubmit_item(lip, &ailp->ail_buf_list); return lip->li_ops->iop_push(lip, &ailp->ail_buf_list); } @@ -774,6 +815,17 @@ xfs_trans_ail_update_bulk( xfs_ail_update_finish(ailp, tail_lsn); } +/* Insert a log item into the AIL. */ +void +xfs_trans_ail_insert( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + spin_lock(&ailp->ail_lock); + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); +} + /* * Delete one log item from the AIL. * @@ -800,39 +852,19 @@ xfs_ail_delete_one( return 0; } -/** - * Remove a log items from the AIL - * - * @xfs_trans_ail_delete_bulk takes an array of log items that all need to - * removed from the AIL. The caller is already holding the AIL lock, and done - * all the checks necessary to ensure the items passed in via @log_items are - * ready for deletion. This includes checking that the items are in the AIL. - * - * For each log item to be removed, unlink it from the AIL, clear the IN_AIL - * flag from the item and reset the item's lsn to 0. If we remove the first - * item in the AIL, update the log tail to match the new minimum LSN in the - * AIL. - * - * This function will not drop the AIL lock until all items are removed from - * the AIL to minimise the amount of lock traffic on the AIL. This does not - * greatly increase the AIL hold time, but does significantly reduce the amount - * of traffic on the lock, especially during IO completion. - * - * This function must be called with the AIL lock held. The lock is dropped - * before returning. - */ void xfs_trans_ail_delete( - struct xfs_ail *ailp, struct xfs_log_item *lip, int shutdown_type) { + struct xfs_ail *ailp = lip->li_ailp; struct xfs_mount *mp = ailp->ail_mount; xfs_lsn_t tail_lsn; + spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { + if (shutdown_type && !XFS_FORCED_SHUTDOWN(mp)) { xfs_alert_tag(mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); @@ -841,6 +873,7 @@ xfs_trans_ail_delete( return; } + /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); xfs_ail_update_finish(ailp, tail_lsn); } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index d1b9869bc5fa..c0f73b82c055 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -388,7 +388,7 @@ xfs_trans_apply_dquot_deltas( */ if (d->d_id) { xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); - xfs_qm_adjust_dqtimers(tp->t_mountp, d); + xfs_qm_adjust_dqtimers(tp->t_mountp, dqp); } dqp->dq_flags |= XFS_DQ_DIRTY; @@ -591,7 +591,7 @@ xfs_trans_dqresv( xfs_dqlock(dqp); - defq = xfs_get_defquota(dqp, q); + defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); if (flags & XFS_TRANS_DQ_RES_BLKS) { hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); @@ -602,7 +602,7 @@ xfs_trans_dqresv( softlimit = defq->bsoftlimit; timer = be32_to_cpu(dqp->q_core.d_btimer); warns = be16_to_cpu(dqp->q_core.d_bwarns); - warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; + warnlimit = defq->bwarnlimit; resbcountp = &dqp->q_res_bcount; } else { ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); @@ -614,7 +614,7 @@ xfs_trans_dqresv( softlimit = defq->rtbsoftlimit; timer = be32_to_cpu(dqp->q_core.d_rtbtimer); warns = be16_to_cpu(dqp->q_core.d_rtbwarns); - warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; + warnlimit = defq->rtbwarnlimit; resbcountp = &dqp->q_res_rtbcount; } @@ -650,7 +650,7 @@ xfs_trans_dqresv( total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos; timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); - warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; + warnlimit = defq->iwarnlimit; hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); if (!hardlimit) hardlimit = defq->ihardlimit; @@ -711,7 +711,7 @@ xfs_trans_dqresv( error_return: xfs_dqunlock(dqp); - if (flags & XFS_QMOPT_ENOSPC) + if (XFS_QM_ISPDQ(dqp)) return -ENOSPC; return -EDQUOT; } @@ -751,8 +751,7 @@ xfs_trans_reserve_quota_bydquots( ASSERT(flags & XFS_QMOPT_RESBLK_MASK); if (udqp) { - error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, - (flags & ~XFS_QMOPT_ENOSPC)); + error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags); if (error) return error; } @@ -803,16 +802,12 @@ xfs_trans_reserve_quota_nblks( if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; - if (XFS_IS_PQUOTA_ON(mp)) - flags |= XFS_QMOPT_ENOSPC; ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == - XFS_TRANS_DQ_RES_RTBLKS || - (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == - XFS_TRANS_DQ_RES_BLKS); + ASSERT((flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_RTBLKS || + (flags & ~(XFS_QMOPT_FORCE_RES)) == XFS_TRANS_DQ_RES_BLKS); /* * Reserve nblks against these dquots, with trans as the mediator. diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 35655eac01a6..3004aeac9110 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -91,26 +91,13 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } +void xfs_trans_ail_insert(struct xfs_ail *ailp, struct xfs_log_item *lip, + xfs_lsn_t lsn); + xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn) __releases(ailp->ail_lock); -void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type); - -static inline void -xfs_trans_ail_remove( - struct xfs_log_item *lip, - int shutdown_type) -{ - struct xfs_ail *ailp = lip->li_ailp; - - spin_lock(&ailp->ail_lock); - /* xfs_trans_ail_delete() drops the AIL lock */ - if (test_bit(XFS_LI_IN_AIL, &lip->li_flags)) - xfs_trans_ail_delete(ailp, lip, shutdown_type); - else - spin_unlock(&ailp->ail_lock); -} +void xfs_trans_ail_delete(struct xfs_log_item *lip, int shutdown_type); void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_ail_push_all(struct xfs_ail *); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index fc5d7276026e..bca48b308c02 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -12,7 +12,6 @@ #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_acl.h" -#include "xfs_da_format.h" #include "xfs_da_btree.h" #include <linux/posix_acl_xattr.h> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 3ce9829a6936..d79b821ed1c7 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -20,6 +20,7 @@ #include <linux/mman.h> #include <linux/sched/mm.h> #include <linux/crc32.h> +#include <linux/task_io_accounting_ops.h> #include "zonefs.h" @@ -78,10 +79,9 @@ static int zonefs_readpage(struct file *unused, struct page *page) return iomap_readpage(page, &zonefs_iomap_ops); } -static int zonefs_readpages(struct file *unused, struct address_space *mapping, - struct list_head *pages, unsigned int nr_pages) +static void zonefs_readahead(struct readahead_control *rac) { - return iomap_readpages(mapping, pages, nr_pages, &zonefs_iomap_ops); + iomap_readahead(rac, &zonefs_iomap_ops); } /* @@ -128,7 +128,7 @@ static int zonefs_writepages(struct address_space *mapping, static const struct address_space_operations zonefs_file_aops = { .readpage = zonefs_readpage, - .readpages = zonefs_readpages, + .readahead = zonefs_readahead, .writepage = zonefs_writepage, .writepages = zonefs_writepages, .set_page_dirty = iomap_set_page_dirty, @@ -478,7 +478,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) ret = file_write_and_wait_range(file, start, end); if (!ret) - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL); if (ret) zonefs_io_error(inode, true); @@ -596,6 +596,61 @@ static const struct iomap_dio_ops zonefs_write_dio_ops = { .end_io = zonefs_file_write_dio_end_io, }; +static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int max; + struct bio *bio; + ssize_t size; + int nr_pages; + ssize_t ret; + + nr_pages = iov_iter_npages(from, BIO_MAX_PAGES); + if (!nr_pages) + return 0; + + max = queue_max_zone_append_sectors(bdev_get_queue(bdev)); + max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); + iov_iter_truncate(from, max); + + bio = bio_alloc_bioset(GFP_NOFS, nr_pages, &fs_bio_set); + if (!bio) + return -ENOMEM; + + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = zi->i_zsector; + bio->bi_write_hint = iocb->ki_hint; + bio->bi_ioprio = iocb->ki_ioprio; + bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE; + if (iocb->ki_flags & IOCB_DSYNC) + bio->bi_opf |= REQ_FUA; + + ret = bio_iov_iter_get_pages(bio, from); + if (unlikely(ret)) { + bio_io_error(bio); + return ret; + } + size = bio->bi_iter.bi_size; + task_io_account_write(ret); + + if (iocb->ki_flags & IOCB_HIPRI) + bio_set_polled(bio, iocb); + + ret = submit_bio_wait(bio); + + bio_put(bio); + + zonefs_file_write_dio_end_io(iocb, size, ret, 0); + if (ret >= 0) { + iocb->ki_pos += size; + return size; + } + + return ret; +} + /* * Handle direct writes. For sequential zone files, this is the only possible * write path. For these files, check that the user is issuing writes @@ -611,6 +666,8 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct zonefs_inode_info *zi = ZONEFS_I(inode); struct super_block *sb = inode->i_sb; + bool sync = is_sync_kiocb(iocb); + bool append = false; size_t count; ssize_t ret; @@ -619,7 +676,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) * as this can cause write reordering (e.g. the first aio gets EAGAIN * on the inode lock but the second goes through but is now unaligned). */ - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !is_sync_kiocb(iocb) && + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; @@ -643,16 +700,22 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) } /* Enforce sequential writes (append only) in sequential zones */ - mutex_lock(&zi->i_truncate_mutex); - if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && iocb->ki_pos != zi->i_wpoffset) { + if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { + mutex_lock(&zi->i_truncate_mutex); + if (iocb->ki_pos != zi->i_wpoffset) { + mutex_unlock(&zi->i_truncate_mutex); + ret = -EINVAL; + goto inode_unlock; + } mutex_unlock(&zi->i_truncate_mutex); - ret = -EINVAL; - goto inode_unlock; + append = sync; } - mutex_unlock(&zi->i_truncate_mutex); - ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, - &zonefs_write_dio_ops, is_sync_kiocb(iocb)); + if (append) + ret = zonefs_file_dio_append(iocb, from); + else + ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, + &zonefs_write_dio_ops, sync); if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) |