diff options
Diffstat (limited to 'fs')
317 files changed, 18653 insertions, 7721 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index ac2ec4543fe1..09fd4a185fd2 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -32,13 +32,13 @@ endif config 9P_FS_SECURITY - bool "9P Security Labels" - depends on 9P_FS - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute handler for file security - labels in the 9P filesystem. - - If you are not using a security module that requires using - extended attributes for file security labels, say N. + bool "9P Security Labels" + depends on 9P_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the 9P filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index fe7f0bd2048e..92cd1d80218d 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -388,7 +388,10 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", iov_iter_count(to), iocb->ki_pos); - ret = p9_client_read(fid, iocb->ki_pos, to, &err); + if (iocb->ki_filp->f_flags & O_NONBLOCK) + ret = p9_client_read_once(fid, iocb->ki_pos, to, &err); + else + ret = p9_client_read(fid, iocb->ki_pos, to, &err); if (!ret) return err; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index b82423a72f68..c9255d399917 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -143,7 +143,7 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, default: p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n", type, stat->extension); - }; + } *rdev = MKDEV(major, minor); } else res |= S_IFREG; diff --git a/fs/Kconfig b/fs/Kconfig index 708ba336e689..f08fbbfafd9a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -140,9 +140,10 @@ endmenu endif # BLOCK if BLOCK -menu "DOS/FAT/NT Filesystems" +menu "DOS/FAT/EXFAT/NT Filesystems" source "fs/fat/Kconfig" +source "fs/exfat/Kconfig" source "fs/ntfs/Kconfig" endmenu diff --git a/fs/Makefile b/fs/Makefile index 505e51166973..2ce5112b02c8 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ obj-$(CONFIG_CODA_FS) += coda/ obj-$(CONFIG_MINIX_FS) += minix/ obj-$(CONFIG_FAT_FS) += fat/ +obj-$(CONFIG_EXFAT_FS) += exfat/ obj-$(CONFIG_BFS_FS) += bfs/ obj-$(CONFIG_ISO9660_FS) += isofs/ obj-$(CONFIG_HFSPLUS_FS) += hfsplus/ # Before hfs to find wrapped HFS+ diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index a3cdb0036c5d..f3a0f412b43b 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -186,7 +186,7 @@ static int find_autofs_mount(const char *pathname, struct path path; int err; - err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); + err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path); if (err) return err; err = -ENOENT; @@ -519,8 +519,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, if (!fp || param->ioctlfd == -1) { if (autofs_type_any(type)) - err = kern_path_mountpoint(AT_FDCWD, - name, &path, LOOKUP_FOLLOW); + err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_MOUNTPOINT, + &path); else err = find_autofs_mount(name, &path, test_by_type, &type); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f4713ea76e82..13f25e241ac4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -27,6 +27,7 @@ #include <linux/highuid.h> #include <linux/compiler.h> #include <linux/highmem.h> +#include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> #include <linux/security.h> @@ -698,19 +699,11 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf; - struct { - struct elfhdr interp_elf_ex; - } *loc; + struct elfhdr *interp_elf_ex = NULL; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; struct mm_struct *mm; struct pt_regs *regs; - loc = kmalloc(sizeof(*loc), GFP_KERNEL); - if (!loc) { - retval = -ENOMEM; - goto out_ret; - } - retval = -ENOEXEC; /* First of all, some simple consistency checks */ if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0) @@ -770,9 +763,15 @@ static int load_elf_binary(struct linux_binprm *bprm) */ would_dump(bprm, interpreter); + interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL); + if (!interp_elf_ex) { + retval = -ENOMEM; + goto out_free_ph; + } + /* Get the exec headers */ - retval = elf_read(interpreter, &loc->interp_elf_ex, - sizeof(loc->interp_elf_ex), 0); + retval = elf_read(interpreter, interp_elf_ex, + sizeof(*interp_elf_ex), 0); if (retval < 0) goto out_free_dentry; @@ -806,25 +805,25 @@ out_free_interp: if (interpreter) { retval = -ELIBBAD; /* Not an ELF interpreter */ - if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0) goto out_free_dentry; /* Verify the interpreter has a valid arch */ - if (!elf_check_arch(&loc->interp_elf_ex) || - elf_check_fdpic(&loc->interp_elf_ex)) + if (!elf_check_arch(interp_elf_ex) || + elf_check_fdpic(interp_elf_ex)) goto out_free_dentry; /* Load the interpreter program headers */ - interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex, + interp_elf_phdata = load_elf_phdrs(interp_elf_ex, interpreter); if (!interp_elf_phdata) goto out_free_dentry; /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ elf_ppnt = interp_elf_phdata; - for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++) + for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_LOPROC ... PT_HIPROC: - retval = arch_elf_pt_proc(&loc->interp_elf_ex, + retval = arch_elf_pt_proc(interp_elf_ex, elf_ppnt, interpreter, true, &arch_state); if (retval) @@ -839,7 +838,7 @@ out_free_interp: * the exec syscall. */ retval = arch_check_elf(elf_ex, - !!interpreter, &loc->interp_elf_ex, + !!interpreter, interp_elf_ex, &arch_state); if (retval) goto out_free_dentry; @@ -1055,7 +1054,7 @@ out_free_interp: } if (interpreter) { - elf_entry = load_elf_interp(&loc->interp_elf_ex, + elf_entry = load_elf_interp(interp_elf_ex, interpreter, load_bias, interp_elf_phdata); if (!IS_ERR((void *)elf_entry)) { @@ -1064,7 +1063,7 @@ out_free_interp: * adjustment */ interp_load_addr = elf_entry; - elf_entry += loc->interp_elf_ex.e_entry; + elf_entry += interp_elf_ex->e_entry; } if (BAD_ADDR(elf_entry)) { retval = IS_ERR((void *)elf_entry) ? @@ -1075,6 +1074,9 @@ out_free_interp: allow_write_access(interpreter); fput(interpreter); + + kfree(interp_elf_ex); + kfree(interp_elf_phdata); } else { elf_entry = e_entry; if (BAD_ADDR(elf_entry)) { @@ -1083,7 +1085,6 @@ out_free_interp: } } - kfree(interp_elf_phdata); kfree(elf_phdata); set_binfmt(&elf_format); @@ -1153,12 +1154,11 @@ out_free_interp: start_thread(regs, elf_entry, bprm->p); retval = 0; out: - kfree(loc); -out_ret: return retval; /* error cleanup */ out_free_dentry: + kfree(interp_elf_ex); kfree(interp_elf_phdata); allow_write_access(interpreter); if (interpreter) @@ -1317,7 +1317,7 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, } /* Hugetlb memory check */ - if (vma->vm_flags & VM_HUGETLB) { + if (is_vm_hugetlb_page(vma)) { if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) goto whole; if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) diff --git a/fs/block_dev.c b/fs/block_dev.c index 9501880dff5e..52b6f646cdbd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -34,6 +34,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> #include <linux/uaccess.h> +#include <linux/suspend.h> #include "internal.h" struct bdev_inode { @@ -2013,7 +2014,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if (bdev_read_only(I_BDEV(bd_inode))) return -EPERM; - if (IS_SWAPFILE(bd_inode)) + /* uswsusp needs write permission to the swap */ + if (IS_SWAPFILE(bd_inode) && !hibernation_available()) return -ETXTBSY; if (!iov_iter_count(from)) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7ab616601141..6f4678d98df7 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -159,8 +159,6 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, if (!PagePrivate(page)) return; - ClearPageChecked(page); - dout("%p invalidatepage %p idx %lu full dirty page\n", inode, page, page->index); @@ -183,6 +181,47 @@ static int ceph_releasepage(struct page *page, gfp_t g) } /* + * Read some contiguous pages. If we cross a stripe boundary, shorten + * *plen. Return number of bytes read, or error. + */ +static int ceph_sync_readpages(struct ceph_fs_client *fsc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + u64 off, u64 *plen, + u32 truncate_seq, u64 truncate_size, + struct page **pages, int num_pages, + int page_align) +{ + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_osd_request *req; + int rc = 0; + + dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, + vino.snap, off, *plen); + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, truncate_seq, truncate_size, + false); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* it may be a short read due to an object boundary */ + osd_req_op_extent_osd_data_pages(req, 0, + pages, *plen, page_align, false, false); + + dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", + off, *plen, *plen, page_align); + + rc = ceph_osdc_start_request(osdc, req, false); + if (!rc) + rc = ceph_osdc_wait_request(osdc, req); + + ceph_osdc_put_request(req); + dout("readpages result %d\n", rc); + return rc; +} + +/* * read a single page, without unlocking it. */ static int ceph_do_readpage(struct file *filp, struct page *page) @@ -218,7 +257,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page) dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); - err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), + err = ceph_sync_readpages(fsc, ceph_vino(inode), &ci->i_layout, off, &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); @@ -571,6 +610,47 @@ static u64 get_writepages_data_length(struct inode *inode, } /* + * do a synchronous write on N pages + */ +static int ceph_sync_writepages(struct ceph_fs_client *fsc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + struct ceph_snap_context *snapc, + u64 off, u64 len, + u32 truncate_seq, u64 truncate_size, + struct timespec64 *mtime, + struct page **pages, int num_pages) +{ + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_osd_request *req; + int rc = 0; + int page_align = off & ~PAGE_MASK; + + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + snapc, truncate_seq, truncate_size, + true); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* it may be a short write due to an object boundary */ + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, false); + dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); + + req->r_mtime = *mtime; + rc = ceph_osdc_start_request(osdc, req, true); + if (!rc) + rc = ceph_osdc_wait_request(osdc, req); + + ceph_osdc_put_request(req); + if (rc == 0) + rc = len; + dout("writepages result %d\n", rc); + return rc; +} + +/* * Write a single page, but leave the page locked. * * If we get a write error, mark the mapping for error, but still adjust the @@ -628,7 +708,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); - err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), + err = ceph_sync_writepages(fsc, ceph_vino(inode), &ci->i_layout, snapc, page_off, len, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, @@ -1575,7 +1655,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) do { lock_page(page); - if ((off > size) || (page->mapping != inode->i_mapping)) { + if (page_mkwrite_check_truncate(page, inode) < 0) { unlock_page(page); ret = VM_FAULT_NOPAGE; break; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 270b769607a2..2f5cb6bc78e1 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -32,7 +32,7 @@ struct ceph_fscache_entry { size_t uniq_len; /* The following members must be last */ struct ceph_fsid fsid; - char uniquifier[0]; + char uniquifier[]; }; static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 28ae0c134700..185db76300b3 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -490,13 +490,10 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct ceph_mount_options *opt = mdsc->fsc->mount_options; - - ci->i_hold_caps_min = round_jiffies(jiffies + - opt->caps_wanted_delay_min * HZ); ci->i_hold_caps_max = round_jiffies(jiffies + opt->caps_wanted_delay_max * HZ); - dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, - ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); + dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode, + ci->i_hold_caps_max - jiffies); } /* @@ -508,10 +505,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, - struct ceph_inode_info *ci, - bool set_timeout) + struct ceph_inode_info *ci) { - dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, + dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode, ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); @@ -520,8 +516,7 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, goto no_change; list_del_init(&ci->i_cap_delay_list); } - if (set_timeout) - __cap_set_timeouts(mdsc, ci); + __cap_set_timeouts(mdsc, ci); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); no_change: spin_unlock(&mdsc->cap_delay_lock); @@ -561,19 +556,20 @@ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, spin_unlock(&mdsc->cap_delay_lock); } -/* - * Common issue checks for add_cap, handle_cap_grant. - */ +/* Common issue checks for add_cap, handle_cap_grant. */ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, unsigned issued) { unsigned had = __ceph_caps_issued(ci, NULL); + lockdep_assert_held(&ci->i_ceph_lock); + /* * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + if (S_ISREG(ci->vfs_inode.i_mode) && + (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; } @@ -592,6 +588,13 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, __ceph_dir_clear_complete(ci); } } + + /* Wipe saved layout if we're losing DIR_CREATE caps */ + if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && + !(issued & CEPH_CAP_DIR_CREATE)) { + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); + } } /* @@ -605,7 +608,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, */ void ceph_add_cap(struct inode *inode, struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, + unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, u64 realmino, int flags, struct ceph_cap **new_cap) { @@ -621,13 +624,6 @@ void ceph_add_cap(struct inode *inode, dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, session->s_mds, cap_id, ceph_cap_string(issued), seq); - /* - * If we are opening the file, include file mode wanted bits - * in wanted. - */ - if (fmode >= 0) - wanted |= ceph_caps_for_mode(fmode); - spin_lock(&session->s_gen_ttl_lock); gen = session->s_cap_gen; spin_unlock(&session->s_gen_ttl_lock); @@ -725,7 +721,7 @@ void ceph_add_cap(struct inode *inode, dout(" issued %s, mds wanted %s, actual %s, queueing\n", ceph_cap_string(issued), ceph_cap_string(wanted), ceph_cap_string(actual_wanted)); - __cap_delay_requeue(mdsc, ci, true); + __cap_delay_requeue(mdsc, ci); } if (flags & CEPH_CAP_FLAG_AUTH) { @@ -752,9 +748,6 @@ void ceph_add_cap(struct inode *inode, cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; - - if (fmode >= 0) - __ceph_get_fmode(ci, fmode); } /* @@ -958,29 +951,97 @@ int __ceph_caps_used(struct ceph_inode_info *ci) if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; if (ci->i_rdcache_ref || - (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */ + (S_ISREG(ci->vfs_inode.i_mode) && ci->vfs_inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; if (ci->i_wb_ref || ci->i_wrbuffer_ref) used |= CEPH_CAP_FILE_BUFFER; + if (ci->i_fx_ref) + used |= CEPH_CAP_FILE_EXCL; return used; } +#define FMODE_WAIT_BIAS 1000 + /* * wanted, by virtue of open file modes */ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { - int i, bits = 0; - for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { - if (ci->i_nr_by_mode[i]) - bits |= 1 << i; + const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); + const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); + const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); + const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); + struct ceph_mount_options *opt = + ceph_inode_to_client(&ci->vfs_inode)->mount_options; + unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; + unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; + + if (S_ISDIR(ci->vfs_inode.i_mode)) { + int want = 0; + + /* use used_cutoff here, to keep dir's wanted caps longer */ + if (ci->i_nr_by_mode[RD_SHIFT] > 0 || + time_after(ci->i_last_rd, used_cutoff)) + want |= CEPH_CAP_ANY_SHARED; + + if (ci->i_nr_by_mode[WR_SHIFT] > 0 || + time_after(ci->i_last_wr, used_cutoff)) { + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) + want |= CEPH_CAP_ANY_DIR_OPS; + } + + if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0) + want |= CEPH_CAP_PIN; + + return want; + } else { + int bits = 0; + + if (ci->i_nr_by_mode[RD_SHIFT] > 0) { + if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || + time_after(ci->i_last_rd, used_cutoff)) + bits |= 1 << RD_SHIFT; + } else if (time_after(ci->i_last_rd, idle_cutoff)) { + bits |= 1 << RD_SHIFT; + } + + if (ci->i_nr_by_mode[WR_SHIFT] > 0) { + if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || + time_after(ci->i_last_wr, used_cutoff)) + bits |= 1 << WR_SHIFT; + } else if (time_after(ci->i_last_wr, idle_cutoff)) { + bits |= 1 << WR_SHIFT; + } + + /* check lazyio only when read/write is wanted */ + if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && + ci->i_nr_by_mode[LAZY_SHIFT] > 0) + bits |= 1 << LAZY_SHIFT; + + return bits ? ceph_caps_for_mode(bits >> 1) : 0; } - if (bits == 0) - return 0; - return ceph_caps_for_mode(bits >> 1); +} + +/* + * wanted, by virtue of open file modes AND cap refs (buffered/cached data) + */ +int __ceph_caps_wanted(struct ceph_inode_info *ci) +{ + int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); + if (S_ISDIR(ci->vfs_inode.i_mode)) { + /* we want EXCL if holding caps of dir ops */ + if (w & CEPH_CAP_ANY_DIR_OPS) + w |= CEPH_CAP_FILE_EXCL; + } else { + /* we want EXCL if dirty data */ + if (w & CEPH_CAP_FILE_BUFFER) + w |= CEPH_CAP_FILE_EXCL; + } + return w; } /* @@ -1004,14 +1065,6 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) return mds_wanted; } -/* - * called under i_ceph_lock - */ -static int __ceph_is_single_caps(struct ceph_inode_info *ci) -{ - return rb_first(&ci->i_caps) == rb_last(&ci->i_caps); -} - int ceph_is_any_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); @@ -1274,9 +1327,15 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct cap_msg_args arg; int held, revoking; int wake = 0; - int delayed = 0; int ret; + /* Don't send anything if it's still being created. Return delayed */ + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + spin_unlock(&ci->i_ceph_lock); + dout("%s async create in flight for %p\n", __func__, inode); + return 1; + } + held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; @@ -1287,28 +1346,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); - arg.session = cap->session; - - /* don't release wanted unless we've waited a bit. */ - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && - time_before(jiffies, ci->i_hold_caps_min)) { - dout(" delaying issued %s -> %s, wanted %s -> %s on send\n", - ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & retain), - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(want)); - want |= cap->mds_wanted; - retain |= cap->issued; - delayed = 1; - } - ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH); - if (want & ~cap->mds_wanted) { - /* user space may open/close single file frequently. - * This avoids droping mds_wanted immediately after - * requesting new mds_wanted. - */ - __cap_set_timeouts(mdsc, ci); - } + ci->i_ceph_flags &= ~CEPH_I_FLUSH; cap->issued &= retain; /* drop bits we don't want */ if (cap->implemented & ~cap->issued) { @@ -1323,6 +1361,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, cap->implemented &= cap->issued | used; cap->mds_wanted = want; + arg.session = cap->session; arg.ino = ceph_vino(inode).ino; arg.cid = cap->cap_id; arg.follows = flushing ? ci->i_head_snapc->seq : 0; @@ -1332,7 +1371,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.size = inode->i_size; ci->i_reported_size = arg.size; arg.max_size = ci->i_wanted_max_size; - ci->i_requested_max_size = arg.max_size; + if (cap == ci->i_auth_cap) + ci->i_requested_max_size = arg.max_size; if (flushing & CEPH_CAP_XATTR_EXCL) { old_blob = __ceph_build_xattrs_blob(ci); @@ -1383,14 +1423,19 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ret = send_cap_msg(&arg); if (ret < 0) { - dout("error sending cap msg, must requeue %p\n", inode); - delayed = 1; + pr_err("error sending cap msg, ino (%llx.%llx) " + "flushing %s tid %llu, requeue\n", + ceph_vinop(inode), ceph_cap_string(flushing), + flush_tid); + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); } if (wake) wake_up_all(&ci->i_cap_wq); - return delayed; + return ret; } static inline int __send_flush_snap(struct inode *inode, @@ -1617,6 +1662,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, int was = ci->i_dirty_caps; int dirty = 0; + lockdep_assert_held(&ci->i_ceph_lock); + if (!ci->i_auth_cap) { pr_warn("__mark_dirty_caps %p %llx mask %s, " "but no auth cap (session was closed?)\n", @@ -1654,7 +1701,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && (mask & CEPH_CAP_FILE_BUFFER)) dirty |= I_DIRTY_DATASYNC; - __cap_delay_requeue(mdsc, ci, true); + __cap_delay_requeue(mdsc, ci); return dirty; } @@ -1726,6 +1773,7 @@ static u64 __mark_caps_flushing(struct inode *inode, struct ceph_cap_flush *cf = NULL; int flushing; + lockdep_assert_held(&ci->i_ceph_lock); BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); BUG_ON(!ci->i_prealloc_cap_flush); @@ -1805,8 +1853,6 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) * versus held caps. Release, flush, ack revoked caps to mds as * appropriate. * - * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay - * cap release further. * CHECK_CAPS_AUTHONLY - we should only check the auth cap * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. @@ -1825,24 +1871,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ struct rb_node *p; - int delayed = 0, sent = 0; - bool no_delay = flags & CHECK_CAPS_NODELAY; bool queue_invalidate = false; bool tried_invalidate = false; - /* if we are unmounting, flush any unused caps immediately. */ - if (mdsc->stopping) - no_delay = true; - spin_lock(&ci->i_ceph_lock); - if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; - if (!(flags & CHECK_CAPS_AUTHONLY) || - (ci->i_auth_cap && __ceph_is_single_caps(ci))) - __cap_delay_cancel(mdsc, ci); - goto retry_locked; retry: spin_lock(&ci->i_ceph_lock); @@ -1866,10 +1901,11 @@ retry_locked: * revoking the shared cap on every create/unlink * operation. */ - if (IS_RDONLY(inode)) + if (IS_RDONLY(inode)) { want = CEPH_CAP_ANY_SHARED; - else - want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + } else { + want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + } retain |= want; } else { @@ -1885,14 +1921,13 @@ retry_locked: } dout("check_caps %p file_want %s used %s dirty %s flushing %s" - " issued %s revoking %s retain %s %s%s%s\n", inode, + " issued %s revoking %s retain %s %s%s\n", inode, ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(issued), ceph_cap_string(revoking), ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", - (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "", (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); /* @@ -1900,8 +1935,8 @@ retry_locked: * have cached pages, but don't want them, then try to invalidate. * If we fail, it's because pages are locked.... try again later. */ - if ((!no_delay || mdsc->stopping) && - !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ + if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && + S_ISREG(inode->i_mode) && !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| @@ -1973,28 +2008,17 @@ retry_locked: } /* want more caps from mds? */ - if (want & ~(cap->mds_wanted | cap->issued)) - goto ack; + if (want & ~cap->mds_wanted) { + if (want & ~(cap->mds_wanted | cap->issued)) + goto ack; + if (!__cap_is_valid(cap)) + goto ack; + } /* things we might delay */ if ((cap->issued & ~retain) == 0) continue; /* nope, all good */ - if (no_delay) - goto ack; - - /* delay? */ - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && - time_before(jiffies, ci->i_hold_caps_max)) { - dout(" delaying issued %s -> %s, wanted %s -> %s\n", - ceph_cap_string(cap->issued), - ceph_cap_string(cap->issued & retain), - ceph_cap_string(cap->mds_wanted), - ceph_cap_string(want)); - delayed++; - continue; - } - ack: if (session && session != cap->session) { dout("oops, wrong session %p mutex\n", session); @@ -2055,18 +2079,20 @@ ack: } mds = cap->mds; /* remember mds, so we don't repeat */ - sent++; /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, - cap_used, want, retain, flushing, - flush_tid, oldest_flush_tid); + __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want, + retain, flushing, flush_tid, oldest_flush_tid); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } - /* Reschedule delayed caps release if we delayed anything */ - if (delayed) - __cap_delay_requeue(mdsc, ci, false); + /* periodically re-calculate caps wanted by open files */ + if (__ceph_is_any_real_caps(ci) && + list_empty(&ci->i_cap_delay_list) && + (file_wanted & ~CEPH_CAP_PIN) && + !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + __cap_delay_requeue(mdsc, ci); + } spin_unlock(&ci->i_ceph_lock); @@ -2095,7 +2121,6 @@ retry: retry_locked: if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; - int delayed; if (session != cap->session) { spin_unlock(&ci->i_ceph_lock); @@ -2124,18 +2149,10 @@ retry_locked: &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - CEPH_CLIENT_CAPS_SYNC, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - (cap->issued | cap->implemented), - flushing, flush_tid, oldest_flush_tid); - - if (delayed) { - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci, true); - spin_unlock(&ci->i_ceph_lock); - } + __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, + __ceph_caps_used(ci), __ceph_caps_wanted(ci), + (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); } else { if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = @@ -2233,6 +2250,10 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (datasync) goto out; + ret = ceph_wait_on_async_create(inode); + if (ret) + goto out; + dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); @@ -2335,22 +2356,13 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, if (cf->caps) { dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, cap, cf->tid, ceph_cap_string(cf->caps)); - ci->i_ceph_flags |= CEPH_I_NODELAY; - - ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, (cf->tid < last_snap_flush ? CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), cf->caps, cf->tid, oldest_flush_tid); - if (ret) { - pr_err("kick_flushing_caps: error sending " - "cap flush, ino (%llx.%llx) " - "tid %llu flushing %s\n", - ceph_vinop(inode), cf->tid, - ceph_cap_string(cf->caps)); - } } else { struct ceph_cap_snap *capsnap = container_of(cf, struct ceph_cap_snap, @@ -2457,16 +2469,15 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, } } -static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session, - struct inode *inode) - __releases(ci->i_ceph_lock) +void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, + struct ceph_inode_info *ci) { - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap *cap; + struct ceph_mds_client *mdsc = session->s_mdsc; + struct ceph_cap *cap = ci->i_auth_cap; + + lockdep_assert_held(&ci->i_ceph_lock); - cap = ci->i_auth_cap; - dout("kick_flushing_inode_caps %p flushing %s\n", inode, + dout("%s %p flushing %s\n", __func__, &ci->vfs_inode, ceph_cap_string(ci->i_flushing_caps)); if (!list_empty(&ci->i_cap_flush_list)) { @@ -2478,9 +2489,6 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, spin_unlock(&mdsc->cap_dirty_lock); __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); - spin_unlock(&ci->i_ceph_lock); - } else { - spin_unlock(&ci->i_ceph_lock); } } @@ -2488,18 +2496,20 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, /* * Take references to capabilities we hold, so that we don't release * them to the MDS prematurely. - * - * Protected by i_ceph_lock. */ -static void __take_cap_refs(struct ceph_inode_info *ci, int got, +void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, bool snap_rwsem_locked) { + lockdep_assert_held(&ci->i_ceph_lock); + if (got & CEPH_CAP_PIN) ci->i_pin_ref++; if (got & CEPH_CAP_FILE_RD) ci->i_rd_ref++; if (got & CEPH_CAP_FILE_CACHE) ci->i_rdcache_ref++; + if (got & CEPH_CAP_FILE_EXCL) + ci->i_fx_ref++; if (got & CEPH_CAP_FILE_WR) { if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { BUG_ON(!snap_rwsem_locked); @@ -2512,7 +2522,7 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, if (ci->i_wb_ref == 0) ihold(&ci->vfs_inode); ci->i_wb_ref++; - dout("__take_cap_refs %p wb %d -> %d (?)\n", + dout("%s %p wb %d -> %d (?)\n", __func__, &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref); } } @@ -2524,14 +2534,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, * Note that caller is responsible for ensuring max_size increases are * requested from the MDS. * - * Returns 0 if caps were not able to be acquired (yet), a 1 if they were, - * or a negative error code. - * - * FIXME: how does a 0 return differ from -EAGAIN? + * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, + * or a negative error code. There are 3 speical error codes: + * -EAGAIN: need to sleep but non-blocking is specified + * -EFBIG: ask caller to call check_max_size() and try again. + * -ESTALE: ask caller to call ceph_renew_caps() and try again. */ enum { - NON_BLOCKING = 1, - CHECK_FILELOCK = 2, + /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ + NON_BLOCKING = (1 << 8), + CHECK_FILELOCK = (1 << 9), }; static int try_get_cap_refs(struct inode *inode, int need, int want, @@ -2541,7 +2553,6 @@ static int try_get_cap_refs(struct inode *inode, int need, int want, struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; - int file_wanted; bool snap_rwsem_locked = false; dout("get_cap_refs %p need %s want %s\n", inode, @@ -2557,15 +2568,6 @@ again: goto out_unlock; } - /* make sure file is actually open */ - file_wanted = __ceph_caps_file_wanted(ci); - if ((file_wanted & need) != need) { - dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", - ceph_cap_string(need), ceph_cap_string(file_wanted)); - ret = -EBADF; - goto out_unlock; - } - /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); @@ -2584,7 +2586,7 @@ again: dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) - ret = -EAGAIN; + ret = ci->i_auth_cap ? -EFBIG : -ESTALE; goto out_unlock; } /* @@ -2630,51 +2632,55 @@ again: } snap_rwsem_locked = true; } - *got = need | (have & want); - if ((need & CEPH_CAP_FILE_RD) && + if ((have & want) == want) + *got = need | want; + else + *got = need; + if (S_ISREG(inode->i_mode) && + (need & CEPH_CAP_FILE_RD) && !(*got & CEPH_CAP_FILE_CACHE)) ceph_disable_fscache_readpage(ci); - __take_cap_refs(ci, *got, true); + ceph_take_cap_refs(ci, *got, true); ret = 1; } } else { int session_readonly = false; - if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { + int mds_wanted; + if (ci->i_auth_cap && + (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { struct ceph_mds_session *s = ci->i_auth_cap->session; spin_lock(&s->s_cap_lock); session_readonly = s->s_readonly; spin_unlock(&s->s_cap_lock); } if (session_readonly) { - dout("get_cap_refs %p needed %s but mds%d readonly\n", + dout("get_cap_refs %p need %s but mds%d readonly\n", inode, ceph_cap_string(need), ci->i_auth_cap->mds); ret = -EROFS; goto out_unlock; } - if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { - int mds_wanted; - if (READ_ONCE(mdsc->fsc->mount_state) == - CEPH_MOUNT_SHUTDOWN) { - dout("get_cap_refs %p forced umount\n", inode); - ret = -EIO; - goto out_unlock; - } - mds_wanted = __ceph_caps_mds_wanted(ci, false); - if (need & ~(mds_wanted & need)) { - dout("get_cap_refs %p caps were dropped" - " (session killed?)\n", inode); - ret = -ESTALE; - goto out_unlock; - } - if (!(file_wanted & ~mds_wanted)) - ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; + if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + ret = -EIO; + goto out_unlock; + } + mds_wanted = __ceph_caps_mds_wanted(ci, false); + if (need & ~mds_wanted) { + dout("get_cap_refs %p need %s > mds_wanted %s\n", + inode, ceph_cap_string(need), + ceph_cap_string(mds_wanted)); + ret = -ESTALE; + goto out_unlock; } - dout("get_cap_refs %p have %s needed %s\n", inode, + dout("get_cap_refs %p have %s need %s\n", inode, ceph_cap_string(have), ceph_cap_string(need)); } out_unlock: + + __ceph_touch_fmode(ci, mdsc, flags); + spin_unlock(&ci->i_ceph_lock); if (snap_rwsem_locked) up_read(&mdsc->snap_rwsem); @@ -2712,20 +2718,40 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } +static inline int get_used_fmode(int caps) +{ + int fmode = 0; + if (caps & CEPH_CAP_FILE_RD) + fmode |= CEPH_FILE_MODE_RD; + if (caps & CEPH_CAP_FILE_WR) + fmode |= CEPH_FILE_MODE_WR; + return fmode; +} + int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got) { - int ret; + int ret, flags; BUG_ON(need & ~CEPH_CAP_FILE_RD); - BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); - ret = ceph_pool_perm_check(inode, need); - if (ret < 0) - return ret; + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | + CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_ANY_DIR_OPS)); + if (need) { + ret = ceph_pool_perm_check(inode, need); + if (ret < 0) + return ret; + } + + flags = get_used_fmode(need | want); + if (nonblock) + flags |= NON_BLOCKING; - ret = try_get_cap_refs(inode, need, want, 0, - (nonblock ? NON_BLOCKING : 0), got); - return ret == -EAGAIN ? 0 : ret; + ret = try_get_cap_refs(inode, need, want, 0, flags, got); + /* three special error codes */ + if (ret == -EAGAIN || ret == -EFBIG || ret == -EAGAIN) + ret = 0; + return ret; } /* @@ -2750,16 +2776,16 @@ int ceph_get_caps(struct file *filp, int need, int want, fi->filp_gen != READ_ONCE(fsc->filp_gen)) return -EBADF; - while (true) { - if (endoff > 0) - check_max_size(inode, endoff); + flags = get_used_fmode(need | want); - flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0; + while (true) { + flags &= CEPH_FILE_MODE_MASK; + if (atomic_read(&fi->num_locks)) + flags |= CHECK_FILELOCK; _got = 0; ret = try_get_cap_refs(inode, need, want, endoff, flags, &_got); - if (ret == -EAGAIN) - continue; + WARN_ON_ONCE(ret == -EAGAIN); if (!ret) { struct ceph_mds_client *mdsc = fsc->mdsc; struct cap_wait cw; @@ -2774,6 +2800,8 @@ int ceph_get_caps(struct file *filp, int need, int want, list_add(&cw.list, &mdsc->cap_wait_list); spin_unlock(&mdsc->caps_list_lock); + /* make sure used fmode not timeout */ + ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); add_wait_queue(&ci->i_cap_wq, &wait); flags |= NON_BLOCKING; @@ -2787,6 +2815,7 @@ int ceph_get_caps(struct file *filp, int need, int want, } remove_wait_queue(&ci->i_cap_wq, &wait); + ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); spin_lock(&mdsc->caps_list_lock); list_del(&cw.list); @@ -2804,16 +2833,26 @@ int ceph_get_caps(struct file *filp, int need, int want, } if (ret < 0) { + if (ret == -EFBIG || ret == -ESTALE) { + int ret2 = ceph_wait_on_async_create(inode); + if (ret2 < 0) + return ret2; + } + if (ret == -EFBIG) { + check_max_size(inode, endoff); + continue; + } if (ret == -ESTALE) { /* session was killed, try renew caps */ - ret = ceph_renew_caps(inode); + ret = ceph_renew_caps(inode, flags); if (ret == 0) continue; } return ret; } - if (ci->i_inline_version != CEPH_INLINE_NONE && + if (S_ISREG(ci->vfs_inode.i_mode) && + ci->i_inline_version != CEPH_INLINE_NONE && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { struct page *page = @@ -2846,7 +2885,8 @@ int ceph_get_caps(struct file *filp, int need, int want, break; } - if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) + if (S_ISREG(ci->vfs_inode.i_mode) && + (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) ceph_fscache_revalidate_cookie(ci); *got = _got; @@ -2860,7 +2900,7 @@ int ceph_get_caps(struct file *filp, int need, int want, void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { spin_lock(&ci->i_ceph_lock); - __take_cap_refs(ci, caps, false); + ceph_take_cap_refs(ci, caps, false); spin_unlock(&ci->i_ceph_lock); } @@ -2911,6 +2951,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) if (had & CEPH_CAP_FILE_CACHE) if (--ci->i_rdcache_ref == 0) last++; + if (had & CEPH_CAP_FILE_EXCL) + if (--ci->i_fx_ref == 0) + last++; if (had & CEPH_CAP_FILE_BUFFER) { if (--ci->i_wb_ref == 0) { last++; @@ -2950,7 +2993,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); - if (last && !flushsnaps) + if (last) ceph_check_caps(ci, 0, NULL); else if (flushsnaps) ceph_flush_snaps(ci, NULL); @@ -3032,7 +3075,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, spin_unlock(&ci->i_ceph_lock); if (last) { - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, 0, NULL); } else if (flush_snaps) { ceph_flush_snaps(ci, NULL); } @@ -3133,7 +3176,7 @@ static void handle_cap_grant(struct inode *inode, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ - if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ + if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { @@ -3297,11 +3340,12 @@ static void handle_cap_grant(struct inode *inode, ceph_cap_string(cap->issued), ceph_cap_string(newcaps), ceph_cap_string(revoking)); - if (revoking & used & CEPH_CAP_FILE_BUFFER) + if (S_ISREG(inode->i_mode) && + (revoking & used & CEPH_CAP_FILE_BUFFER)) writeback = true; /* initiate writeback; will delay ack */ - else if (revoking == CEPH_CAP_FILE_CACHE && - (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && - queue_invalidate) + else if (queue_invalidate && + revoking == CEPH_CAP_FILE_CACHE && + (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) ; /* do nothing yet, invalidation will be queued */ else if (cap == ci->i_auth_cap) check_caps = 1; /* check auth cap only */ @@ -3339,7 +3383,8 @@ static void handle_cap_grant(struct inode *inode, if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { if (newcaps & ~extra_info->issued) wake = true; - kick_flushing_inode_caps(session->s_mdsc, session, inode); + ceph_kick_flushing_inode_caps(session, ci); + spin_unlock(&ci->i_ceph_lock); up_read(&session->s_mdsc->snap_rwsem); } else { spin_unlock(&ci->i_ceph_lock); @@ -3367,10 +3412,10 @@ static void handle_cap_grant(struct inode *inode, wake_up_all(&ci->i_cap_wq); if (check_caps == 1) - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY, + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL, session); else if (check_caps == 2) - ceph_check_caps(ci, CHECK_CAPS_NODELAY, session); + ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session); else mutex_unlock(&session->s_mutex); } @@ -3619,8 +3664,6 @@ retry: goto out_unlock; if (target < 0) { - if (cap->mds_wanted | cap->issued) - ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; __ceph_remove_cap(cap, false); goto out_unlock; } @@ -3668,7 +3711,7 @@ retry: /* add placeholder for the export tagert */ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; tcap = new_cap; - ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, + ceph_add_cap(inode, tsession, t_cap_id, issued, 0, t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); if (!list_empty(&ci->i_cap_flush_list) && @@ -3773,7 +3816,7 @@ retry: __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); - ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, + ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, &new_cap); ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; @@ -4047,7 +4090,6 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) { struct inode *inode; struct ceph_inode_info *ci; - int flags = CHECK_CAPS_NODELAY; dout("check_delayed_caps\n"); while (1) { @@ -4067,7 +4109,7 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) if (inode) { dout("check_delayed_caps on %p\n", inode); - ceph_check_caps(ci, flags, NULL); + ceph_check_caps(ci, 0, NULL); /* avoid calling iput_final() in tick thread */ ceph_async_iput(inode); } @@ -4092,7 +4134,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) ihold(inode); dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); iput(inode); spin_lock(&mdsc->cap_dirty_lock); } @@ -4100,14 +4142,31 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) dout("flush_dirty_caps done\n"); } -void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) +void __ceph_touch_fmode(struct ceph_inode_info *ci, + struct ceph_mds_client *mdsc, int fmode) +{ + unsigned long now = jiffies; + if (fmode & CEPH_FILE_MODE_RD) + ci->i_last_rd = now; + if (fmode & CEPH_FILE_MODE_WR) + ci->i_last_wr = now; + /* queue periodic check */ + if (fmode && + __ceph_is_any_real_caps(ci) && + list_empty(&ci->i_cap_delay_list)) + __cap_delay_requeue(mdsc, ci); +} + +void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { int i; int bits = (fmode << 1) | 1; + spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) - ci->i_nr_by_mode[i]++; + ci->i_nr_by_mode[i] += count; } + spin_unlock(&ci->i_ceph_lock); } /* @@ -4115,26 +4174,18 @@ void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode) * we may need to release capabilities to the MDS (or schedule * their delayed release). */ -void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) +void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { - int i, last = 0; + int i; int bits = (fmode << 1) | 1; spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) { - BUG_ON(ci->i_nr_by_mode[i] == 0); - if (--ci->i_nr_by_mode[i] == 0) - last++; + BUG_ON(ci->i_nr_by_mode[i] < count); + ci->i_nr_by_mode[i] -= count; } } - dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n", - &ci->vfs_inode, fmode, - ci->i_nr_by_mode[0], ci->i_nr_by_mode[1], - ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]); spin_unlock(&ci->i_ceph_lock); - - if (last && ci->i_vino.snap == CEPH_NOSNAP) - ceph_check_caps(ci, 0, NULL); } /* @@ -4152,7 +4203,6 @@ int ceph_drop_caps_for_unlink(struct inode *inode) if (inode->i_nlink == 1) { drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); - ci->i_ceph_flags |= CEPH_I_NODELAY; if (__ceph_caps_dirty(ci)) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; @@ -4208,8 +4258,6 @@ int ceph_encode_inode_release(void **p, struct inode *inode, if (force || (cap->issued & drop)) { if (cap->issued & drop) { int wanted = __ceph_caps_wanted(ci); - if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0) - wanted |= cap->mds_wanted; dout("encode_inode_release %p cap %p " "%s -> %s, wanted %s -> %s\n", inode, cap, ceph_cap_string(cap->issued), diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fb7cabd98e7b..481ac97b4d25 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -218,10 +218,10 @@ static int mds_sessions_show(struct seq_file *s, void *ptr) return 0; } -CEPH_DEFINE_SHOW_FUNC(mdsmap_show) -CEPH_DEFINE_SHOW_FUNC(mdsc_show) -CEPH_DEFINE_SHOW_FUNC(caps_show) -CEPH_DEFINE_SHOW_FUNC(mds_sessions_show) +DEFINE_SHOW_ATTRIBUTE(mdsmap); +DEFINE_SHOW_ATTRIBUTE(mdsc); +DEFINE_SHOW_ATTRIBUTE(caps); +DEFINE_SHOW_ATTRIBUTE(mds_sessions); /* @@ -281,25 +281,25 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) 0400, fsc->client->debugfs_dir, fsc, - &mdsmap_show_fops); + &mdsmap_fops); fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions", 0400, fsc->client->debugfs_dir, fsc, - &mds_sessions_show_fops); + &mds_sessions_fops); fsc->debugfs_mdsc = debugfs_create_file("mdsc", 0400, fsc->client->debugfs_dir, fsc, - &mdsc_show_fops); + &mdsc_fops); fsc->debugfs_caps = debugfs_create_file("caps", 0400, fsc->client->debugfs_dir, fsc, - &caps_show_fops); + &caps_fops); } diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d0cd0aba5843..d594c2627430 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -335,8 +335,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ctx->pos = 2; } - /* can we use the dcache? */ spin_lock(&ci->i_ceph_lock); + /* request Fx cap. if have Fx, we don't need to release Fs cap + * for later create/unlink. */ + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_WR); + /* can we use the dcache? */ if (ceph_test_mount_opt(fsc, DCACHE) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && @@ -752,7 +755,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct ceph_dentry_info *di = ceph_dentry(dentry); spin_lock(&ci->i_ceph_lock); - dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); + dout(" dir %p flags are 0x%lx\n", dir, ci->i_ceph_flags); if (strncmp(dentry->d_name.name, fsc->mount_options->snapdir_name, dentry->d_name.len) && @@ -760,6 +763,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, ceph_test_mount_opt(fsc, DCACHE) && __ceph_dir_is_complete(ci) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD); spin_unlock(&ci->i_ceph_lock); dout(" dir %p complete, -ENOENT\n", dir); d_add(dentry, NULL); @@ -1036,6 +1040,78 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, return err; } +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + int result = req->r_err ? req->r_err : + le32_to_cpu(req->r_reply_info.head->result); + + if (result == -EJUKEBOX) + goto out; + + /* If op failed, mark everyone involved for errors */ + if (result) { + int pathlen; + u64 base; + char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &base, 0); + + /* mark error on parent + clear complete */ + mapping_set_error(req->r_parent->i_mapping, result); + ceph_dir_clear_complete(req->r_parent); + + /* drop the dentry -- we don't know its status */ + if (!d_unhashed(req->r_dentry)) + d_drop(req->r_dentry); + + /* mark inode itself for an error (since metadata is bogus) */ + mapping_set_error(req->r_old_inode->i_mapping, result); + + pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n", + base, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path(path, pathlen); + } +out: + iput(req->r_old_inode); + ceph_mdsc_release_dir_caps(req); +} + +static int get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di; + int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK; + + spin_lock(&ci->i_ceph_lock); + if ((__ceph_caps_issued(ci, NULL) & want) == want) { + ceph_take_cap_refs(ci, want, false); + got = want; + } + spin_unlock(&ci->i_ceph_lock); + + /* If we didn't get anything, return 0 */ + if (!got) + return 0; + + spin_lock(&dentry->d_lock); + di = ceph_dentry(dentry); + /* + * - We are holding Fx, which implies Fs caps. + * - Only support async unlink for primary linkage + */ + if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen || + !(di->flags & CEPH_DENTRY_PRIMARY_LINK)) + want = 0; + spin_unlock(&dentry->d_lock); + + /* Do we still want what we've got? */ + if (want == got) + return got; + + ceph_put_cap_refs(ci, got); + return 0; +} + /* * rmdir and unlink are differ only by the metadata op code */ @@ -1045,6 +1121,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = d_inode(dentry); struct ceph_mds_request *req; + bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); int err = -EROFS; int op; @@ -1059,6 +1136,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; } else goto out; +retry: req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -1067,13 +1145,39 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) req->r_dentry = dget(dentry); req->r_num_caps = 2; req->r_parent = dir; - set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; req->r_inode_drop = ceph_drop_caps_for_unlink(inode); - err = ceph_mdsc_do_request(mdsc, dir, req); - if (!err && !req->r_reply_info.head->is_dentry) - d_delete(dentry); + + if (try_async && op == CEPH_MDS_OP_UNLINK && + (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { + dout("async unlink on %lu/%.*s caps=%s", dir->i_ino, + dentry->d_name.len, dentry->d_name.name, + ceph_cap_string(req->r_dir_caps)); + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); + req->r_callback = ceph_async_unlink_cb; + req->r_old_inode = d_inode(dentry); + ihold(req->r_old_inode); + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) { + /* + * We have enough caps, so we assume that the unlink + * will succeed. Fix up the target inode and dcache. + */ + drop_nlink(inode); + d_delete(dentry); + } else if (err == -EJUKEBOX) { + try_async = false; + ceph_mdsc_put_request(req); + goto retry; + } + } else { + set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); + err = ceph_mdsc_do_request(mdsc, dir, req); + if (!err && !req->r_reply_info.head->is_dentry) + d_delete(dentry); + } + ceph_mdsc_put_request(req); out: return err; @@ -1411,6 +1515,7 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry) spin_lock(&dentry->d_lock); di->time = jiffies; di->lease_shared_gen = 0; + di->flags &= ~CEPH_DENTRY_PRIMARY_LINK; __dentry_lease_unlist(di); spin_unlock(&dentry->d_lock); } @@ -1520,7 +1625,8 @@ static int __dir_lease_try_check(const struct dentry *dentry) /* * Check if directory-wide content lease/cap is valid. */ -static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) +static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry, + struct ceph_mds_client *mdsc) { struct ceph_inode_info *ci = ceph_inode(dir); int valid; @@ -1528,7 +1634,10 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) spin_lock(&ci->i_ceph_lock); valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); - shared_gen = atomic_read(&ci->i_shared_gen); + if (valid) { + __ceph_touch_fmode(ci, mdsc, CEPH_FILE_MODE_RD); + shared_gen = atomic_read(&ci->i_shared_gen); + } spin_unlock(&ci->i_ceph_lock); if (valid) { struct ceph_dentry_info *di; @@ -1554,6 +1663,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) int valid = 0; struct dentry *parent; struct inode *dir, *inode; + struct ceph_mds_client *mdsc; if (flags & LOOKUP_RCU) { parent = READ_ONCE(dentry->d_parent); @@ -1570,6 +1680,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry, dentry, inode, ceph_dentry(dentry)->offset); + mdsc = ceph_sb_to_client(dir->i_sb)->mdsc; + /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, @@ -1581,7 +1693,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) valid = dentry_lease_is_valid(dentry, flags); if (valid == -ECHILD) return valid; - if (valid || dir_lease_is_valid(dir, dentry)) { + if (valid || dir_lease_is_valid(dir, dentry, mdsc)) { if (inode) valid = ceph_is_any_caps(inode); else @@ -1590,8 +1702,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) } if (!valid) { - struct ceph_mds_client *mdsc = - ceph_sb_to_client(dir->i_sb)->mdsc; struct ceph_mds_request *req; int op, err; u32 mask; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index b6bfa94332c3..79dc06881e78 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -315,6 +315,11 @@ static struct dentry *__get_parent(struct super_block *sb, req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, NULL, req); + if (err) { + ceph_mdsc_put_request(req); + return ERR_PTR(err); + } + inode = req->r_target_inode; if (inode) ihold(inode); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5a478cd06e11..4a5ccbb7e808 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -212,10 +212,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, if (isdir) { struct ceph_dir_file_info *dfi = kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); - if (!dfi) { - ceph_put_fmode(ci, fmode); /* clean up */ + if (!dfi) return -ENOMEM; - } file->private_data = dfi; fi = &dfi->file_info; @@ -223,15 +221,15 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, dfi->readdir_cache_idx = -1; } else { fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); - if (!fi) { - ceph_put_fmode(ci, fmode); /* clean up */ + if (!fi) return -ENOMEM; - } file->private_data = fi; } + ceph_get_fmode(ci, fmode, 1); fi->fmode = fmode; + spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); fi->meta_err = errseq_sample(&ci->i_meta_err); @@ -263,7 +261,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) case S_IFLNK: dout("init_file %p %p 0%o (symlink)\n", inode, file, inode->i_mode); - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ break; default: @@ -273,7 +270,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) * we need to drop the open ref now, since we don't * have .release set to ceph_release. */ - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ BUG_ON(inode->i_fop->release == ceph_release); /* call the proper open fop */ @@ -285,14 +281,15 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) /* * try renew caps after session gets killed. */ -int ceph_renew_caps(struct inode *inode) +int ceph_renew_caps(struct inode *inode, int fmode) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; int err, flags, wanted; spin_lock(&ci->i_ceph_lock); + __ceph_touch_fmode(ci, mdsc, fmode); wanted = __ceph_caps_file_wanted(ci); if (__ceph_is_any_real_caps(ci) && (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { @@ -326,7 +323,6 @@ int ceph_renew_caps(struct inode *inode) req->r_inode = inode; ihold(inode); req->r_num_caps = 1; - req->r_fmode = -1; err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); @@ -372,9 +368,6 @@ int ceph_open(struct inode *inode, struct file *file) /* trivially open snapdir */ if (ceph_snap(inode) == CEPH_SNAPDIR) { - spin_lock(&ci->i_ceph_lock); - __ceph_get_fmode(ci, fmode); - spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } @@ -392,7 +385,7 @@ int ceph_open(struct inode *inode, struct file *file) dout("open %p fmode %d want %s issued %s using existing\n", inode, fmode, ceph_cap_string(wanted), ceph_cap_string(issued)); - __ceph_get_fmode(ci, fmode); + __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); /* adjust wanted? */ @@ -404,7 +397,7 @@ int ceph_open(struct inode *inode, struct file *file) return ceph_init_file(inode, file, fmode); } else if (ceph_snap(inode) != CEPH_NOSNAP && (ci->i_snap_caps & wanted) == wanted) { - __ceph_get_fmode(ci, fmode); + __ceph_touch_fmode(ci, mdsc, fmode); spin_unlock(&ci->i_ceph_lock); return ceph_init_file(inode, file, fmode); } @@ -430,6 +423,236 @@ out: return err; } +/* Clone the layout from a synchronous create, if the dir now has Dc caps */ +static void +cache_file_layout(struct inode *dst, struct inode *src) +{ + struct ceph_inode_info *cdst = ceph_inode(dst); + struct ceph_inode_info *csrc = ceph_inode(src); + + spin_lock(&cdst->i_ceph_lock); + if ((__ceph_caps_issued(cdst, NULL) & CEPH_CAP_DIR_CREATE) && + !ceph_file_layout_is_valid(&cdst->i_cached_layout)) { + memcpy(&cdst->i_cached_layout, &csrc->i_layout, + sizeof(cdst->i_cached_layout)); + rcu_assign_pointer(cdst->i_cached_layout.pool_ns, + ceph_try_get_string(csrc->i_layout.pool_ns)); + } + spin_unlock(&cdst->i_ceph_lock); +} + +/* + * Try to set up an async create. We need caps, a file layout, and inode number, + * and either a lease on the dentry or complete dir info. If any of those + * criteria are not satisfied, then return false and the caller can go + * synchronous. + */ +static int try_prep_async_create(struct inode *dir, struct dentry *dentry, + struct ceph_file_layout *lo, u64 *pino) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_dentry_info *di = ceph_dentry(dentry); + int got = 0, want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE; + u64 ino; + + spin_lock(&ci->i_ceph_lock); + /* No auth cap means no chance for Dc caps */ + if (!ci->i_auth_cap) + goto no_async; + + /* Any delegated inos? */ + if (xa_empty(&ci->i_auth_cap->session->s_delegated_inos)) + goto no_async; + + if (!ceph_file_layout_is_valid(&ci->i_cached_layout)) + goto no_async; + + if ((__ceph_caps_issued(ci, NULL) & want) != want) + goto no_async; + + if (d_in_lookup(dentry)) { + if (!__ceph_dir_is_complete(ci)) + goto no_async; + spin_lock(&dentry->d_lock); + di->lease_shared_gen = atomic_read(&ci->i_shared_gen); + spin_unlock(&dentry->d_lock); + } else if (atomic_read(&ci->i_shared_gen) != + READ_ONCE(di->lease_shared_gen)) { + goto no_async; + } + + ino = ceph_get_deleg_ino(ci->i_auth_cap->session); + if (!ino) + goto no_async; + + *pino = ino; + ceph_take_cap_refs(ci, want, false); + memcpy(lo, &ci->i_cached_layout, sizeof(*lo)); + rcu_assign_pointer(lo->pool_ns, + ceph_try_get_string(ci->i_cached_layout.pool_ns)); + got = want; +no_async: + spin_unlock(&ci->i_ceph_lock); + return got; +} + +static void restore_deleg_ino(struct inode *dir, u64 ino) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + struct ceph_mds_session *s = NULL; + + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) + s = ceph_get_mds_session(ci->i_auth_cap->session); + spin_unlock(&ci->i_ceph_lock); + if (s) { + int err = ceph_restore_deleg_ino(s, ino); + if (err) + pr_warn("ceph: unable to restore delegated ino 0x%llx to session: %d\n", + ino, err); + ceph_put_mds_session(s); + } +} + +static void ceph_async_create_cb(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + int result = req->r_err ? req->r_err : + le32_to_cpu(req->r_reply_info.head->result); + + if (result == -EJUKEBOX) + goto out; + + mapping_set_error(req->r_parent->i_mapping, result); + + if (result) { + struct dentry *dentry = req->r_dentry; + int pathlen; + u64 base; + char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, + &base, 0); + + ceph_dir_clear_complete(req->r_parent); + if (!d_unhashed(dentry)) + d_drop(dentry); + + /* FIXME: start returning I/O errors on all accesses? */ + pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", + base, IS_ERR(path) ? "<<bad>>" : path, result); + ceph_mdsc_free_path(path, pathlen); + } + + if (req->r_target_inode) { + struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); + u64 ino = ceph_vino(req->r_target_inode).ino; + + if (req->r_deleg_ino != ino) + pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n", + __func__, req->r_err, req->r_deleg_ino, ino); + mapping_set_error(req->r_target_inode->i_mapping, result); + + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { + ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; + wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + } + ceph_kick_flushing_inode_caps(req->r_session, ci); + spin_unlock(&ci->i_ceph_lock); + } else { + pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, + req->r_deleg_ino); + } +out: + ceph_mdsc_release_dir_caps(req); +} + +static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry, + struct file *file, umode_t mode, + struct ceph_mds_request *req, + struct ceph_acl_sec_ctx *as_ctx, + struct ceph_file_layout *lo) +{ + int ret; + char xattr_buf[4]; + struct ceph_mds_reply_inode in = { }; + struct ceph_mds_reply_info_in iinfo = { .in = &in }; + struct ceph_inode_info *ci = ceph_inode(dir); + struct inode *inode; + struct timespec64 now; + struct ceph_vino vino = { .ino = req->r_deleg_ino, + .snap = CEPH_NOSNAP }; + + ktime_get_real_ts64(&now); + + inode = ceph_get_inode(dentry->d_sb, vino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + iinfo.inline_version = CEPH_INLINE_NONE; + iinfo.change_attr = 1; + ceph_encode_timespec64(&iinfo.btime, &now); + + iinfo.xattr_len = ARRAY_SIZE(xattr_buf); + iinfo.xattr_data = xattr_buf; + memset(iinfo.xattr_data, 0, iinfo.xattr_len); + + in.ino = cpu_to_le64(vino.ino); + in.snapid = cpu_to_le64(CEPH_NOSNAP); + in.version = cpu_to_le64(1); // ??? + in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE); + in.cap.cap_id = cpu_to_le64(1); + in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino); + in.cap.flags = CEPH_CAP_FLAG_AUTH; + in.ctime = in.mtime = in.atime = iinfo.btime; + in.mode = cpu_to_le32((u32)mode); + in.truncate_seq = cpu_to_le32(1); + in.truncate_size = cpu_to_le64(-1ULL); + in.xattr_version = cpu_to_le64(1); + in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid())); + in.gid = cpu_to_le32(from_kgid(&init_user_ns, dir->i_mode & S_ISGID ? + dir->i_gid : current_fsgid())); + in.nlink = cpu_to_le32(1); + in.max_size = cpu_to_le64(lo->stripe_unit); + + ceph_file_layout_to_legacy(lo, &in.layout); + + ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session, + req->r_fmode, NULL); + if (ret) { + dout("%s failed to fill inode: %d\n", __func__, ret); + ceph_dir_clear_complete(dir); + if (!d_unhashed(dentry)) + d_drop(dentry); + if (inode->i_state & I_NEW) + discard_new_inode(inode); + } else { + struct dentry *dn; + + dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__, + vino.ino, dir->i_ino, dentry->d_name.name); + ceph_dir_clear_ordered(dir); + ceph_init_inode_acls(inode, as_ctx); + if (inode->i_state & I_NEW) { + /* + * If it's not I_NEW, then someone created this before + * we got here. Assume the server is aware of it at + * that point and don't worry about setting + * CEPH_I_ASYNC_CREATE. + */ + ceph_inode(inode)->i_ceph_flags = CEPH_I_ASYNC_CREATE; + unlock_new_inode(inode); + } + if (d_in_lookup(dentry) || d_really_is_negative(dentry)) { + if (!d_unhashed(dentry)) + d_drop(dentry); + dn = d_splice_alias(inode, dentry); + WARN_ON_ONCE(dn && dn != dentry); + } + file->f_mode |= FMODE_CREATED; + ret = finish_open(file, dentry, ceph_open); + } + return ret; +} /* * Do a lookup + open with a single request. If we get a non-existent @@ -443,6 +666,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct ceph_mds_request *req; struct dentry *dn; struct ceph_acl_sec_ctx as_ctx = {}; + bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS); int mask; int err; @@ -466,7 +690,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, /* If it's not being looked up, it's negative */ return -ENOENT; } - +retry: /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); if (IS_ERR(req)) { @@ -475,21 +699,43 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } req->r_dentry = dget(dentry); req->r_num_caps = 2; + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; + if (ceph_security_xattr_wanted(dir)) + mask |= CEPH_CAP_XATTR_SHARED; + req->r_args.open.mask = cpu_to_le32(mask); + req->r_parent = dir; + if (flags & O_CREAT) { + struct ceph_file_layout lo; + req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; if (as_ctx.pagelist) { req->r_pagelist = as_ctx.pagelist; as_ctx.pagelist = NULL; } + if (try_async && + (req->r_dir_caps = + try_prep_async_create(dir, dentry, &lo, + &req->r_deleg_ino))) { + set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); + req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); + req->r_callback = ceph_async_create_cb; + err = ceph_mdsc_submit_request(mdsc, dir, req); + if (!err) { + err = ceph_finish_async_create(dir, dentry, + file, mode, req, + &as_ctx, &lo); + } else if (err == -EJUKEBOX) { + restore_deleg_ino(dir, req->r_deleg_ino); + ceph_mdsc_put_request(req); + try_async = false; + goto retry; + } + goto out_req; + } } - mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; - if (ceph_security_xattr_wanted(dir)) - mask |= CEPH_CAP_XATTR_SHARED; - req->r_args.open.mask = cpu_to_le32(mask); - - req->r_parent = dir; set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, @@ -518,14 +764,15 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } else { dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { - ceph_init_inode_acls(d_inode(dentry), &as_ctx); + struct inode *newino = d_inode(dentry); + + cache_file_layout(dir, newino); + ceph_init_inode_acls(newino, &as_ctx); file->f_mode |= FMODE_CREATED; } err = finish_open(file, dentry, ceph_open); } out_req: - if (!req->r_err && req->r_target_inode) - ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); ceph_mdsc_put_request(req); out_ctx: ceph_release_acl_sec_ctx(&as_ctx); @@ -542,7 +789,7 @@ int ceph_release(struct inode *inode, struct file *file) dout("release inode %p dir file %p\n", inode, file); WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); - ceph_put_fmode(ci, dfi->file_info.fmode); + ceph_put_fmode(ci, dfi->file_info.fmode, 1); if (dfi->last_readdir) ceph_mdsc_put_request(dfi->last_readdir); @@ -554,7 +801,8 @@ int ceph_release(struct inode *inode, struct file *file) dout("release inode %p regular file %p\n", inode, file); WARN_ON(!list_empty(&fi->rw_contexts)); - ceph_put_fmode(ci, fi->fmode); + ceph_put_fmode(ci, fi->fmode, 1); + kmem_cache_free(ceph_file_cachep, fi); } @@ -1567,7 +1815,7 @@ retry_snap: if (dirty) __mark_inode_dirty(inode, dirty); if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) - ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL); + ceph_check_caps(ci, 0, NULL); } dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", @@ -1944,6 +2192,71 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, return 0; } +static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off, + struct ceph_inode_info *dst_ci, u64 *dst_off, + struct ceph_fs_client *fsc, + size_t len, unsigned int flags) +{ + struct ceph_object_locator src_oloc, dst_oloc; + struct ceph_object_id src_oid, dst_oid; + size_t bytes = 0; + u64 src_objnum, src_objoff, dst_objnum, dst_objoff; + u32 src_objlen, dst_objlen; + u32 object_size = src_ci->i_layout.object_size; + int ret; + + src_oloc.pool = src_ci->i_layout.pool_id; + src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); + dst_oloc.pool = dst_ci->i_layout.pool_id; + dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); + + while (len >= object_size) { + ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off, + object_size, &src_objnum, + &src_objoff, &src_objlen); + ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off, + object_size, &dst_objnum, + &dst_objoff, &dst_objlen); + ceph_oid_init(&src_oid); + ceph_oid_printf(&src_oid, "%llx.%08llx", + src_ci->i_vino.ino, src_objnum); + ceph_oid_init(&dst_oid); + ceph_oid_printf(&dst_oid, "%llx.%08llx", + dst_ci->i_vino.ino, dst_objnum); + /* Do an object remote copy */ + ret = ceph_osdc_copy_from(&fsc->client->osdc, + src_ci->i_vino.snap, 0, + &src_oid, &src_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, + &dst_oid, &dst_oloc, + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, + dst_ci->i_truncate_seq, + dst_ci->i_truncate_size, + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); + if (ret) { + if (ret == -EOPNOTSUPP) { + fsc->have_copy_from2 = false; + pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); + } + dout("ceph_osdc_copy_from returned %d\n", ret); + if (!bytes) + bytes = ret; + goto out; + } + len -= object_size; + bytes += object_size; + *src_off += object_size; + *dst_off += object_size; + } + +out: + ceph_oloc_destroy(&src_oloc); + ceph_oloc_destroy(&dst_oloc); + return bytes; +} + static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags) @@ -1954,14 +2267,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); struct ceph_cap_flush *prealloc_cf; struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); - struct ceph_object_locator src_oloc, dst_oloc; - struct ceph_object_id src_oid, dst_oid; - loff_t endoff = 0, size; - ssize_t ret = -EIO; + loff_t size; + ssize_t ret = -EIO, bytes; u64 src_objnum, dst_objnum, src_objoff, dst_objoff; - u32 src_objlen, dst_objlen, object_size; + u32 src_objlen, dst_objlen; int src_got = 0, dst_got = 0, err, dirty; - bool do_final_copy = false; if (src_inode->i_sb != dst_inode->i_sb) { struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); @@ -2039,22 +2349,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, if (ret < 0) goto out_caps; - size = i_size_read(dst_inode); - endoff = dst_off + len; - /* Drop dst file cached pages */ ret = invalidate_inode_pages2_range(dst_inode->i_mapping, dst_off >> PAGE_SHIFT, - endoff >> PAGE_SHIFT); + (dst_off + len) >> PAGE_SHIFT); if (ret < 0) { dout("Failed to invalidate inode pages (%zd)\n", ret); ret = 0; /* XXX */ } - src_oloc.pool = src_ci->i_layout.pool_id; - src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); - dst_oloc.pool = dst_ci->i_layout.pool_id; - dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); - ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, src_ci->i_layout.object_size, &src_objnum, &src_objoff, &src_objlen); @@ -2073,6 +2375,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * starting at the src_off */ if (src_objoff) { + dout("Initial partial copy of %u bytes\n", src_objlen); + /* * we need to temporarily drop all caps as we'll be calling * {read,write}_iter, which will get caps again. @@ -2080,8 +2384,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); ret = do_splice_direct(src_file, &src_off, dst_file, &dst_off, src_objlen, flags); - if (ret < 0) { - dout("do_splice_direct returned %d\n", err); + /* Abort on short copies or on error */ + if (ret < src_objlen) { + dout("Failed partial copy (%zd)\n", ret); goto out; } len -= ret; @@ -2094,65 +2399,27 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, if (err < 0) goto out_caps; } - object_size = src_ci->i_layout.object_size; - while (len >= object_size) { - ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, - object_size, &src_objnum, - &src_objoff, &src_objlen); - ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, - object_size, &dst_objnum, - &dst_objoff, &dst_objlen); - ceph_oid_init(&src_oid); - ceph_oid_printf(&src_oid, "%llx.%08llx", - src_ci->i_vino.ino, src_objnum); - ceph_oid_init(&dst_oid); - ceph_oid_printf(&dst_oid, "%llx.%08llx", - dst_ci->i_vino.ino, dst_objnum); - /* Do an object remote copy */ - err = ceph_osdc_copy_from( - &src_fsc->client->osdc, - src_ci->i_vino.snap, 0, - &src_oid, &src_oloc, - CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | - CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, - &dst_oid, &dst_oloc, - CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | - CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, - dst_ci->i_truncate_seq, dst_ci->i_truncate_size, - CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); - if (err) { - if (err == -EOPNOTSUPP) { - src_fsc->have_copy_from2 = false; - pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); - } - dout("ceph_osdc_copy_from returned %d\n", err); - if (!ret) - ret = err; - goto out_caps; - } - len -= object_size; - src_off += object_size; - dst_off += object_size; - ret += object_size; - } - if (len) - /* We still need one final local copy */ - do_final_copy = true; + size = i_size_read(dst_inode); + bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off, + src_fsc, len, flags); + if (bytes <= 0) { + if (!ret) + ret = bytes; + goto out_caps; + } + dout("Copied %zu bytes out of %zu\n", bytes, len); + len -= bytes; + ret += bytes; file_update_time(dst_file); inode_inc_iversion_raw(dst_inode); - if (endoff > size) { - int caps_flags = 0; - + if (dst_off > size) { /* Let the MDS know about dst file size change */ - if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff)) - caps_flags |= CHECK_CAPS_NODELAY; - if (ceph_inode_set_size(dst_inode, endoff)) - caps_flags |= CHECK_CAPS_AUTHONLY; - if (caps_flags) - ceph_check_caps(dst_ci, caps_flags, NULL); + if (ceph_inode_set_size(dst_inode, dst_off) || + ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL); } /* Mark Fw dirty */ spin_lock(&dst_ci->i_ceph_lock); @@ -2165,15 +2432,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, out_caps: put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); - if (do_final_copy) { - err = do_splice_direct(src_file, &src_off, dst_file, - &dst_off, len, flags); - if (err < 0) { - dout("do_splice_direct returned %d\n", err); - goto out; - } - len -= err; - ret += err; + /* + * Do the final manual copy if we still have some bytes left, unless + * there were errors in remote object copies (len >= object_size). + */ + if (len && (len < src_ci->i_layout.object_size)) { + dout("Final partial copy of %zu bytes\n", len); + bytes = do_splice_direct(src_file, &src_off, dst_file, + &dst_off, len, flags); + if (bytes > 0) + ret += bytes; + else + dout("Failed partial copy (%zd)\n", bytes); } out: diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d01710a16a4a..7fef94fd1e55 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -82,10 +82,14 @@ struct inode *ceph_get_snapdir(struct inode *parent) inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; + inode->i_mtime = parent->i_mtime; + inode->i_ctime = parent->i_ctime; + inode->i_atime = parent->i_atime; inode->i_op = &ceph_snapdir_iops; inode->i_fop = &ceph_snapdir_fops; ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ ci->i_rbytes = 0; + ci->i_btime = ceph_inode(parent)->i_btime; if (inode->i_state & I_NEW) unlock_new_inode(inode); @@ -447,6 +451,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_max_files = 0; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); ci->i_fragtree = RB_ROOT; @@ -471,13 +476,13 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_prealloc_cap_flush = NULL; INIT_LIST_HEAD(&ci->i_cap_flush_list); init_waitqueue_head(&ci->i_cap_wq); - ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; INIT_LIST_HEAD(&ci->i_cap_delay_list); INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; + ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ; for (i = 0; i < CEPH_FILE_MODE_BITS; i++) ci->i_nr_by_mode[i] = 0; @@ -496,6 +501,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_rdcache_ref = 0; ci->i_wr_ref = 0; ci->i_wb_ref = 0; + ci->i_fx_ref = 0; ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; atomic_set(&ci->i_filelock_ref, 0); @@ -586,6 +592,7 @@ void ceph_evict_inode(struct inode *inode) ceph_buffer_put(ci->i_xattrs.prealloc_blob); ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns)); + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); } static inline blkcnt_t calc_inode_blocks(u64 size) @@ -636,7 +643,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, if ((issued & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_BUFFER)) || mapping_mapped(inode->i_mapping) || - __ceph_caps_file_wanted(ci)) { + __ceph_is_file_opened(ci)) { ci->i_truncate_pending++; queue_trunc = 1; } @@ -727,11 +734,11 @@ void ceph_fill_file_time(struct inode *inode, int issued, * Populate an inode based on info from mds. May be called on new or * existing inodes. */ -static int fill_inode(struct inode *inode, struct page *locked_page, - struct ceph_mds_reply_info_in *iinfo, - struct ceph_mds_reply_dirfrag *dirinfo, - struct ceph_mds_session *session, int cap_fmode, - struct ceph_cap_reservation *caps_reservation) +int ceph_fill_inode(struct inode *inode, struct page *locked_page, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, int cap_fmode, + struct ceph_cap_reservation *caps_reservation) { struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; struct ceph_mds_reply_inode *info = iinfo->in; @@ -748,7 +755,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, bool new_version = false; bool fill_inline = false; - dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", + dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__, inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); @@ -769,7 +776,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (iinfo->xattr_len > 4) { xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS); if (!xattr_blob) - pr_err("fill_inode ENOMEM xattr blob %d bytes\n", + pr_err("%s ENOMEM xattr blob %d bytes\n", __func__, iinfo->xattr_len); } @@ -932,8 +939,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); if (symlen != i_size_read(inode)) { - pr_err("fill_inode %llx.%llx BAD symlink " - "size %lld\n", ceph_vinop(inode), + pr_err("%s %llx.%llx BAD symlink " + "size %lld\n", __func__, + ceph_vinop(inode), i_size_read(inode)); i_size_write(inode, symlen); inode->i_blocks = calc_inode_blocks(symlen); @@ -957,7 +965,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, inode->i_fop = &ceph_dir_fops; break; default: - pr_err("fill_inode %llx.%llx BAD mode 0%o\n", + pr_err("%s %llx.%llx BAD mode 0%o\n", __func__, ceph_vinop(inode), inode->i_mode); } @@ -966,7 +974,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (ceph_snap(inode) == CEPH_NOSNAP) { ceph_add_cap(inode, session, le64_to_cpu(info->cap.cap_id), - cap_fmode, info_caps, + info_caps, le32_to_cpu(info->cap.wanted), le32_to_cpu(info->cap.seq), le32_to_cpu(info->cap.mseq), @@ -991,13 +999,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, dout(" %p got snap_caps %s\n", inode, ceph_cap_string(info_caps)); ci->i_snap_caps |= info_caps; - if (cap_fmode >= 0) - __ceph_get_fmode(ci, cap_fmode); } - } else if (cap_fmode >= 0) { - pr_warn("mds issued no caps on %llx.%llx\n", - ceph_vinop(inode)); - __ceph_get_fmode(ci, cap_fmode); } if (iinfo->inline_version > 0 && @@ -1009,6 +1011,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page, fill_inline = true; } + if (cap_fmode >= 0) { + if (!info_caps) + pr_warn("mds issued no caps on %llx.%llx\n", + ceph_vinop(inode)); + __ceph_touch_fmode(ci, mdsc, cap_fmode); + } + spin_unlock(&ci->i_ceph_lock); if (fill_inline) @@ -1050,6 +1059,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, struct ceph_mds_session **old_lease_session) { struct ceph_dentry_info *di = ceph_dentry(dentry); + unsigned mask = le16_to_cpu(lease->mask); long unsigned duration = le32_to_cpu(lease->duration_ms); long unsigned ttl = from_time + (duration * HZ) / 1000; long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; @@ -1061,8 +1071,13 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry, if (ceph_snap(dir) != CEPH_NOSNAP) return; + if (mask & CEPH_LEASE_PRIMARY_LINK) + di->flags |= CEPH_DENTRY_PRIMARY_LINK; + else + di->flags &= ~CEPH_DENTRY_PRIMARY_LINK; + di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen); - if (duration == 0) { + if (!(mask & CEPH_LEASE_VALID)) { __ceph_dentry_dir_lease_touch(di); return; } @@ -1239,10 +1254,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) struct inode *dir = req->r_parent; if (dir) { - err = fill_inode(dir, NULL, - &rinfo->diri, rinfo->dirfrag, - session, -1, - &req->r_caps_reservation); + err = ceph_fill_inode(dir, NULL, &rinfo->diri, + rinfo->dirfrag, session, -1, + &req->r_caps_reservation); if (err < 0) goto done; } else { @@ -1307,13 +1321,14 @@ retry_lookup: goto done; } - err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, - session, + err = ceph_fill_inode(in, req->r_locked_page, &rinfo->targeti, + NULL, session, (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && + !test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && rinfo->head->result == 0) ? req->r_fmode : -1, &req->r_caps_reservation); if (err < 0) { - pr_err("fill_inode badness %p %llx.%llx\n", + pr_err("ceph_fill_inode badness %p %llx.%llx\n", in, ceph_vinop(in)); if (in->i_state & I_NEW) discard_new_inode(in); @@ -1500,10 +1515,11 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, dout("new_inode badness got %d\n", err); continue; } - rc = fill_inode(in, NULL, &rde->inode, NULL, session, - -1, &req->r_caps_reservation); + rc = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, + -1, &req->r_caps_reservation); if (rc < 0) { - pr_err("fill_inode badness on %p got %d\n", in, rc); + pr_err("ceph_fill_inode badness on %p got %d\n", + in, rc); err = rc; if (in->i_state & I_NEW) { ihold(in); @@ -1707,10 +1723,10 @@ retry_lookup: } } - ret = fill_inode(in, NULL, &rde->inode, NULL, session, - -1, &req->r_caps_reservation); + ret = ceph_fill_inode(in, NULL, &rde->inode, NULL, session, + -1, &req->r_caps_reservation); if (ret < 0) { - pr_err("fill_inode badness on %p\n", in); + pr_err("ceph_fill_inode badness on %p\n", in); if (d_really_is_negative(dn)) { /* avoid calling iput_final() in mds * dispatch threads */ @@ -1972,7 +1988,7 @@ retry: mutex_unlock(&ci->i_truncate_mutex); if (wrbuffer_refs == 0) - ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + ceph_check_caps(ci, 0, NULL); wake_up_all(&ci->i_cap_wq); } diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index c90f03beb15d..6e061bf62ad4 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -243,11 +243,13 @@ static long ceph_ioctl_lazyio(struct file *file) struct ceph_file_info *fi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { spin_lock(&ci->i_ceph_lock); fi->fmode |= CEPH_FILE_MODE_LAZY; ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++; + __ceph_touch_fmode(ci, mdsc, fi->fmode); spin_unlock(&ci->i_ceph_lock); dout("ioctl_layzio: file %p marked lazy\n", file); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 544e9e85b120..d6b9166e71e4 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -210,6 +210,21 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, return 0; } +static int try_unlock_file(struct file *file, struct file_lock *fl) +{ + int err; + unsigned int orig_flags = fl->fl_flags; + fl->fl_flags |= FL_EXISTS; + err = locks_lock_file_wait(file, fl); + fl->fl_flags = orig_flags; + if (err == -ENOENT) { + if (!(orig_flags & FL_EXISTS)) + err = 0; + return err; + } + return 1; +} + /** * Attempt to set an fcntl lock. * For now, this just goes away to the server. Later it may be more awesome. @@ -255,9 +270,15 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else lock_cmd = CEPH_LOCK_UNLOCK; + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) { + err = try_unlock_file(file, fl); + if (err <= 0) + return err; + } + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { - if (op == CEPH_MDS_OP_SETFILELOCK) { + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) { dout("mds locked, locking locally\n"); err = posix_lock_file(file, fl, NULL); if (err) { @@ -311,9 +332,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) else lock_cmd = CEPH_LOCK_UNLOCK; + if (F_UNLCK == fl->fl_type) { + err = try_unlock_file(file, fl); + if (err <= 0) + return err; + } + err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, inode, lock_cmd, wait, fl); - if (!err) { + if (!err && F_UNLCK != fl->fl_type) { err = locks_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index bbbbddf71326..486f91f9685b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -415,21 +415,121 @@ bad: return -EIO; } + +#if BITS_PER_LONG == 64 + +#define DELEGATED_INO_AVAILABLE xa_mk_value(1) + +static int ceph_parse_deleg_inos(void **p, void *end, + struct ceph_mds_session *s) +{ + u32 sets; + + ceph_decode_32_safe(p, end, sets, bad); + dout("got %u sets of delegated inodes\n", sets); + while (sets--) { + u64 start, len, ino; + + ceph_decode_64_safe(p, end, start, bad); + ceph_decode_64_safe(p, end, len, bad); + while (len--) { + int err = xa_insert(&s->s_delegated_inos, ino = start++, + DELEGATED_INO_AVAILABLE, + GFP_KERNEL); + if (!err) { + dout("added delegated inode 0x%llx\n", + start - 1); + } else if (err == -EBUSY) { + pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n", + start - 1); + } else { + return err; + } + } + } + return 0; +bad: + return -EIO; +} + +u64 ceph_get_deleg_ino(struct ceph_mds_session *s) +{ + unsigned long ino; + void *val; + + xa_for_each(&s->s_delegated_inos, ino, val) { + val = xa_erase(&s->s_delegated_inos, ino); + if (val == DELEGATED_INO_AVAILABLE) + return ino; + } + return 0; +} + +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) +{ + return xa_insert(&s->s_delegated_inos, ino, DELEGATED_INO_AVAILABLE, + GFP_KERNEL); +} +#else /* BITS_PER_LONG == 64 */ +/* + * FIXME: xarrays can't handle 64-bit indexes on a 32-bit arch. For now, just + * ignore delegated_inos on 32 bit arch. Maybe eventually add xarrays for top + * and bottom words? + */ +static int ceph_parse_deleg_inos(void **p, void *end, + struct ceph_mds_session *s) +{ + u32 sets; + + ceph_decode_32_safe(p, end, sets, bad); + if (sets) + ceph_decode_skip_n(p, end, sets * 2 * sizeof(__le64), bad); + return 0; +bad: + return -EIO; +} + +u64 ceph_get_deleg_ino(struct ceph_mds_session *s) +{ + return 0; +} + +int ceph_restore_deleg_ino(struct ceph_mds_session *s, u64 ino) +{ + return 0; +} +#endif /* BITS_PER_LONG == 64 */ + /* * parse create results */ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - u64 features) + u64 features, struct ceph_mds_session *s) { + int ret; + if (features == (u64)-1 || (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { - /* Malformed reply? */ if (*p == end) { + /* Malformed reply? */ info->has_create_ino = false; - } else { + } else if (test_bit(CEPHFS_FEATURE_DELEG_INO, &s->s_features)) { + u8 struct_v, struct_compat; + u32 len; + info->has_create_ino = true; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_64_safe(p, end, info->ino, bad); + ret = ceph_parse_deleg_inos(p, end, s); + if (ret) + return ret; + } else { + /* legacy */ ceph_decode_64_safe(p, end, info->ino, bad); + info->has_create_ino = true; } } else { if (*p != end) @@ -448,7 +548,7 @@ bad: */ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, - u64 features) + u64 features, struct ceph_mds_session *s) { u32 op = le32_to_cpu(info->head->op); @@ -457,7 +557,7 @@ static int parse_reply_info_extra(void **p, void *end, else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_readdir(p, end, info, features); else if (op == CEPH_MDS_OP_CREATE) - return parse_reply_info_create(p, end, info, features); + return parse_reply_info_create(p, end, info, features, s); else return -EIO; } @@ -465,7 +565,7 @@ static int parse_reply_info_extra(void **p, void *end, /* * parse entire mds reply */ -static int parse_reply_info(struct ceph_msg *msg, +static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg, struct ceph_mds_reply_info_parsed *info, u64 features) { @@ -490,7 +590,7 @@ static int parse_reply_info(struct ceph_msg *msg, ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { ceph_decode_need(&p, end, len, bad); - err = parse_reply_info_extra(&p, p+len, info, features); + err = parse_reply_info_extra(&p, p+len, info, features, s); if (err < 0) goto out_bad; } @@ -558,6 +658,7 @@ void ceph_put_mds_session(struct ceph_mds_session *s) if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) ceph_auth_destroy_authorizer(s->s_auth.authorizer); + xa_destroy(&s->s_delegated_inos); kfree(s); } } @@ -645,6 +746,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); + xa_init(&s->s_delegated_inos); s->s_num_cap_releases = 0; s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; @@ -699,6 +801,7 @@ void ceph_mdsc_release_request(struct kref *kref) struct ceph_mds_request *req = container_of(kref, struct ceph_mds_request, r_kref); + ceph_mdsc_release_dir_caps(req); destroy_reply_info(&req->r_reply_info); if (req->r_request) ceph_msg_put(req->r_request); @@ -736,7 +839,7 @@ void ceph_mdsc_release_request(struct kref *kref) put_request_session(req); ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); WARN_ON_ONCE(!list_empty(&req->r_wait)); - kfree(req); + kmem_cache_free(ceph_mds_request_cachep, req); } DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) @@ -793,8 +896,13 @@ static void __register_request(struct ceph_mds_client *mdsc, mdsc->oldest_tid = req->r_tid; if (dir) { + struct ceph_inode_info *ci = ceph_inode(dir); + ihold(dir); req->r_unsafe_dir = dir; + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops); + spin_unlock(&ci->i_unsafe_lock); } } @@ -822,8 +930,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, erase_request(&mdsc->request_tree, req); - if (req->r_unsafe_dir && - test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + if (req->r_unsafe_dir) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_dir_item); @@ -1407,8 +1514,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); - if (cap->mds_wanted | cap->issued) - ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { struct ceph_cap_flush *cf; @@ -1574,9 +1679,6 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, /* mds did not re-issue stale cap */ spin_lock(&ci->i_ceph_lock); cap->issued = cap->implemented = CEPH_CAP_PIN; - /* make sure mds knows what we want */ - if (__ceph_caps_file_wanted(ci) & ~cap->mds_wanted) - ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; spin_unlock(&ci->i_ceph_lock); } } else if (ev == FORCE_RO) { @@ -1772,7 +1874,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) } /* The inode has cached pages, but it's no longer used. * we can safely drop it */ - if (wanted == 0 && used == CEPH_CAP_FILE_CACHE && + if (S_ISREG(inode->i_mode) && + wanted == 0 && used == CEPH_CAP_FILE_CACHE && !(oissued & CEPH_CAP_FILE_CACHE)) { used = 0; oissued = 0; @@ -2089,8 +2192,9 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { - struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); + struct ceph_mds_request *req; + req = kmem_cache_zalloc(ceph_mds_request_cachep, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); @@ -2368,7 +2472,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->op = cpu_to_le32(req->r_op); head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); - head->ino = 0; + head->ino = cpu_to_le64(req->r_deleg_ino); head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); @@ -2382,7 +2486,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_inode_drop) releases += ceph_encode_inode_release(&p, req->r_inode ? req->r_inode : d_inode(req->r_dentry), - mds, req->r_inode_drop, req->r_inode_unless, 0); + mds, req->r_inode_drop, req->r_inode_unless, + req->r_op == CEPH_MDS_OP_READDIR); if (req->r_dentry_drop) releases += ceph_encode_dentry_release(&p, req->r_dentry, req->r_parent, mds, req->r_dentry_drop, @@ -2522,12 +2627,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc)); if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) flags |= CEPH_MDS_FLAG_REPLAY; + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) + flags |= CEPH_MDS_FLAG_ASYNC; if (req->r_parent) flags |= CEPH_MDS_FLAG_WANT_DENTRY; rhead->flags = cpu_to_le32(flags); rhead->num_fwd = req->r_num_fwd; rhead->num_retry = req->r_attempts - 1; - rhead->ino = 0; dout(" r_parent = %p\n", req->r_parent); return 0; @@ -2573,7 +2679,7 @@ static void __do_request(struct ceph_mds_client *mdsc, if (req->r_timeout && time_after_eq(jiffies, req->r_started + req->r_timeout)) { dout("do_request timed out\n"); - err = -EIO; + err = -ETIMEDOUT; goto finish; } if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { @@ -2605,6 +2711,10 @@ static void __do_request(struct ceph_mds_client *mdsc, mds = __choose_mds(mdsc, req, &random); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { + err = -EJUKEBOX; + goto finish; + } dout("do_request no mds or not active, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); return; @@ -2629,6 +2739,15 @@ static void __do_request(struct ceph_mds_client *mdsc, err = -EACCES; goto out_session; } + /* + * We cannot queue async requests since the caps and delegated + * inodes are bound to the session. Just return -EJUKEBOX and + * let the caller retry a sync request in that case. + */ + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags)) { + err = -EJUKEBOX; + goto out_session; + } if (session->s_state == CEPH_MDS_SESSION_NEW || session->s_state == CEPH_MDS_SESSION_CLOSING) { __open_session(mdsc, session); @@ -2709,19 +2828,43 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req) { - int err; + int err = 0; /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); if (req->r_parent) { - ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + struct ceph_inode_info *ci = ceph_inode(req->r_parent); + int fmode = (req->r_op & CEPH_MDS_OP_WRITE) ? + CEPH_FILE_MODE_WR : CEPH_FILE_MODE_RD; + spin_lock(&ci->i_ceph_lock); + ceph_take_cap_refs(ci, CEPH_CAP_PIN, false); + __ceph_touch_fmode(ci, mdsc, fmode); + spin_unlock(&ci->i_ceph_lock); ihold(req->r_parent); } if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); + if (req->r_inode) { + err = ceph_wait_on_async_create(req->r_inode); + if (err) { + dout("%s: wait for async create returned: %d\n", + __func__, err); + return err; + } + } + + if (!err && req->r_old_inode) { + err = ceph_wait_on_async_create(req->r_old_inode); + if (err) { + dout("%s: wait for async create returned: %d\n", + __func__, err); + return err; + } + } + dout("submit_request on %p for inode %p\n", req, dir); mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); @@ -2747,7 +2890,7 @@ static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, if (timeleft > 0) err = 0; else if (!timeleft) - err = -EIO; /* timed out */ + err = -ETIMEDOUT; /* timed out */ else err = timeleft; /* killed */ } @@ -2935,22 +3078,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } else { set_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags); list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); - if (req->r_unsafe_dir) { - struct ceph_inode_info *ci = - ceph_inode(req->r_unsafe_dir); - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_dir_item, - &ci->i_unsafe_dirops); - spin_unlock(&ci->i_unsafe_lock); - } } dout("handle_reply tid %lld result %d\n", tid, result); rinfo = &req->r_reply_info; if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) - err = parse_reply_info(msg, rinfo, (u64)-1); + err = parse_reply_info(session, msg, rinfo, (u64)-1); else - err = parse_reply_info(msg, rinfo, session->s_con.peer_features); + err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); @@ -3249,6 +3384,17 @@ bad: return; } +void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req) +{ + int dcaps; + + dcaps = xchg(&req->r_dir_caps, 0); + if (dcaps) { + dout("releasing r_dir_caps=%s\n", ceph_cap_string(dcaps)); + ceph_put_cap_refs(ceph_inode(req->r_parent), dcaps); + } +} + /* * called under session->mutex. */ @@ -3276,9 +3422,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, continue; if (req->r_attempts == 0) continue; /* only old requests */ - if (req->r_session && - req->r_session->s_mds == session->s_mds) - __send_request(mdsc, session, req, true); + if (!req->r_session) + continue; + if (req->r_session->s_mds != session->s_mds) + continue; + + ceph_mdsc_release_dir_caps(req); + + __send_request(mdsc, session, req, true); } mutex_unlock(&mdsc->mutex); } @@ -3362,7 +3513,7 @@ fail_msg: /* * Encode information about a cap for a reconnect with the MDS. */ -static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, +static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { union { @@ -3385,6 +3536,15 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, cap->mseq = 0; /* and migrate_seq */ cap->cap_gen = cap->session->s_cap_gen; + /* These are lost when the session goes away */ + if (S_ISDIR(inode->i_mode)) { + if (cap->issued & CEPH_CAP_DIR_CREATE) { + ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); + memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); + } + cap->issued &= ~CEPH_CAP_ANY_DIR_OPS; + } + if (recon_state->msg_version >= 2) { rec.v2.cap_id = cpu_to_le64(cap->cap_id); rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); @@ -3626,6 +3786,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, if (!reply) goto fail_nomsg; + xa_destroy(&session->s_delegated_inos); + mutex_lock(&session->s_mutex); session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_seq = 0; @@ -3681,7 +3843,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, recon_state.msg_version = 2; } /* trsaverse this session's caps */ - err = ceph_iterate_session_caps(session, encode_caps_cb, &recon_state); + err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state); spin_lock(&session->s_cap_lock); session->s_cap_reconnect = 0; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 27a7446e10d3..4e5be79bf080 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -23,8 +23,9 @@ enum ceph_feature_type { CEPHFS_FEATURE_RECLAIM_CLIENT, CEPHFS_FEATURE_LAZY_CAP_WANTED, CEPHFS_FEATURE_MULTI_RECONNECT, + CEPHFS_FEATURE_DELEG_INO, - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT, + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO, }; /* @@ -37,6 +38,7 @@ enum ceph_feature_type { CEPHFS_FEATURE_REPLY_ENCODING, \ CEPHFS_FEATURE_LAZY_CAP_WANTED, \ CEPHFS_FEATURE_MULTI_RECONNECT, \ + CEPHFS_FEATURE_DELEG_INO, \ \ CEPHFS_FEATURE_MAX, \ } @@ -201,6 +203,7 @@ struct ceph_mds_session { struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ + struct xarray s_delegated_inos; }; /* @@ -255,6 +258,7 @@ struct ceph_mds_request { #define CEPH_MDS_R_GOT_RESULT (5) /* got a result */ #define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ #define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ +#define CEPH_MDS_R_ASYNC (8) /* async request */ unsigned long r_req_flags; struct mutex r_fill_mutex; @@ -263,6 +267,7 @@ struct ceph_mds_request { int r_fmode; /* file mode, if expecting cap */ kuid_t r_uid; kgid_t r_gid; + int r_request_release_offset; struct timespec64 r_stamp; /* for choosing which mds to send this request to */ @@ -280,12 +285,16 @@ struct ceph_mds_request { int r_old_inode_drop, r_old_inode_unless; struct ceph_msg *r_request; /* original request */ - int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; - struct page *r_locked_page; int r_err; + + struct page *r_locked_page; + int r_dir_caps; + int r_num_caps; + u32 r_readdir_offset; + unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ unsigned long r_request_started; /* start time for mds request only, @@ -304,6 +313,7 @@ struct ceph_mds_request { int r_num_fwd; /* number of forward attempts */ int r_resend_mds; /* mds to resend to next, if any*/ u32 r_sent_on_mseq; /* cap mseq request was sent at*/ + u64 r_deleg_ino; struct list_head r_wait; struct completion r_completion; @@ -315,10 +325,8 @@ struct ceph_mds_request { long long r_dir_release_cnt; long long r_dir_ordered_cnt; int r_readdir_cache_idx; - u32 r_readdir_offset; struct ceph_cap_reservation r_caps_reservation; - int r_num_caps; }; struct ceph_pool_perm { @@ -488,6 +496,7 @@ extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); +extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) { kref_get(&req->r_kref); @@ -537,4 +546,15 @@ extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, extern int ceph_trim_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int max_caps); + +static inline int ceph_wait_on_async_create(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, + TASK_INTERRUPTIBLE); +} + +extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); +extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); #endif diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c7f150686a53..c9784eb1159a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -155,6 +155,7 @@ enum { Opt_acl, Opt_quotadf, Opt_copyfrom, + Opt_wsync, }; enum ceph_recover_session_mode { @@ -194,6 +195,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), fsparam_u32 ("wsize", Opt_wsize), + fsparam_flag_no ("wsync", Opt_wsync), {} }; @@ -444,6 +446,12 @@ static int ceph_parse_mount_param(struct fs_context *fc, fc->sb_flags &= ~SB_POSIXACL; } break; + case Opt_wsync: + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_ASYNC_DIROPS; + else + fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; + break; default: BUG(); } @@ -567,6 +575,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) seq_show_option(m, "recover_session", "clean"); + if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) + seq_puts(m, ",nowsync"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -729,6 +740,7 @@ struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; struct kmem_cache *ceph_dir_file_cachep; +struct kmem_cache *ceph_mds_request_cachep; static void ceph_inode_init_once(void *foo) { @@ -769,6 +781,10 @@ static int __init init_caches(void) if (!ceph_dir_file_cachep) goto bad_dir_file; + ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD); + if (!ceph_mds_request_cachep) + goto bad_mds_req; + error = ceph_fscache_register(); if (error) goto bad_fscache; @@ -776,6 +792,8 @@ static int __init init_caches(void) return 0; bad_fscache: + kmem_cache_destroy(ceph_mds_request_cachep); +bad_mds_req: kmem_cache_destroy(ceph_dir_file_cachep); bad_dir_file: kmem_cache_destroy(ceph_file_cachep); @@ -804,6 +822,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); kmem_cache_destroy(ceph_dir_file_cachep); + kmem_cache_destroy(ceph_mds_request_cachep); ceph_fscache_unregister(); } @@ -1107,6 +1126,15 @@ static void ceph_free_fc(struct fs_context *fc) static int ceph_reconfigure_fc(struct fs_context *fc) { + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb); + + if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) + ceph_set_mount_opt(fsc, ASYNC_DIROPS); + else + ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + sync_filesystem(fc->root->d_sb); return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 037cdfb2ad4f..60aac3aee055 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -43,13 +43,16 @@ #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ +#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ CEPH_MOUNT_OPT_NOCOPYFROM) #define ceph_set_mount_opt(fsc, opt) \ - (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; + (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt +#define ceph_clear_mount_opt(fsc, opt) \ + (fsc)->mount_options->flags &= ~CEPH_MOUNT_OPT_##opt #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) @@ -170,9 +173,9 @@ struct ceph_cap { struct list_head caps_item; }; -#define CHECK_CAPS_NODELAY 1 /* do not delay any further */ -#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */ -#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */ +#define CHECK_CAPS_AUTHONLY 1 /* only check auth cap */ +#define CHECK_CAPS_FLUSH 2 /* flush any dirty caps */ +#define CHECK_CAPS_NOINVAL 4 /* don't invalidate pagecache */ struct ceph_cap_flush { u64 tid; @@ -284,6 +287,7 @@ struct ceph_dentry_info { #define CEPH_DENTRY_REFERENCED 1 #define CEPH_DENTRY_LEASE_LIST 2 #define CEPH_DENTRY_SHRINK_LIST 4 +#define CEPH_DENTRY_PRIMARY_LINK 8 struct ceph_inode_xattrs_info { /* @@ -315,13 +319,14 @@ struct ceph_inode_info { u64 i_inline_version; u32 i_time_warp_seq; - unsigned i_ceph_flags; + unsigned long i_ceph_flags; atomic64_t i_release_count; atomic64_t i_ordered_count; atomic64_t i_complete_seq[2]; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; + struct ceph_file_layout i_cached_layout; // for async creates char *i_symlink; /* for dirs */ @@ -352,7 +357,6 @@ struct ceph_inode_info { struct ceph_cap_flush *i_prealloc_cap_flush; struct list_head i_cap_flush_list; wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ - unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ struct list_head i_cap_delay_list; /* for delayed cap release to mds */ struct ceph_cap_reservation i_cap_migration_resv; @@ -361,6 +365,8 @@ struct ceph_inode_info { dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ + unsigned long i_last_rd; + unsigned long i_last_wr; int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */ struct mutex i_truncate_mutex; @@ -375,7 +381,7 @@ struct ceph_inode_info { /* held references to caps */ int i_pin_ref; - int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; + int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref, i_fx_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; atomic_t i_filelock_ref; atomic_t i_shared_gen; /* increment each time we get FILE_SHARED */ @@ -511,18 +517,18 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, * Ceph inode. */ #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ -#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ #define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ #define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ -#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ -#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ -#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ -#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ -#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ -#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */ +#define CEPH_I_KICK_FLUSH (1 << 7) /* kick flushing caps */ +#define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */ +#define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */ +#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */ +#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ +#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) /* * Masks of ceph inode work. @@ -674,18 +680,12 @@ extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask); extern int __ceph_caps_used(struct ceph_inode_info *ci); -extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); - -/* - * wanted, by virtue of open file modes AND cap refs (buffered/cached data) - */ -static inline int __ceph_caps_wanted(struct ceph_inode_info *ci) +static inline bool __ceph_is_file_opened(struct ceph_inode_info *ci) { - int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); - if (w & CEPH_CAP_FILE_BUFFER) - w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */ - return w; + return ci->i_nr_by_mode[0]; } +extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci); +extern int __ceph_caps_wanted(struct ceph_inode_info *ci); /* what the mds thinks we want */ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); @@ -899,6 +899,9 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) } /* inode.c */ +struct ceph_mds_reply_info_in; +struct ceph_mds_reply_dirfrag; + extern const struct inode_operations ceph_file_iops; extern struct inode *ceph_alloc_inode(struct super_block *sb); @@ -914,6 +917,11 @@ extern void ceph_fill_file_time(struct inode *inode, int issued, u64 time_warp_seq, struct timespec64 *ctime, struct timespec64 *mtime, struct timespec64 *atime); +extern int ceph_fill_inode(struct inode *inode, struct page *locked_page, + struct ceph_mds_reply_info_in *iinfo, + struct ceph_mds_reply_dirfrag *dirinfo, + struct ceph_mds_session *session, int cap_fmode, + struct ceph_cap_reservation *caps_reservation); extern int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req); extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, @@ -1042,7 +1050,7 @@ extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); extern void ceph_add_cap(struct inode *inode, struct ceph_mds_session *session, u64 cap_id, - int fmode, unsigned issued, unsigned wanted, + unsigned issued, unsigned wanted, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); @@ -1058,8 +1066,12 @@ extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); +void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, + struct ceph_inode_info *ci); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); +extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, + bool snap_rwsem_locked); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, @@ -1084,8 +1096,10 @@ extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); /* for counting open files by mode */ -extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); -extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); +extern void ceph_get_fmode(struct ceph_inode_info *ci, int mode, int count); +extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode, int count); +extern void __ceph_touch_fmode(struct ceph_inode_info *ci, + struct ceph_mds_client *mdsc, int fmode); /* addr.c */ extern const struct address_space_operations ceph_aops; @@ -1097,7 +1111,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; -extern int ceph_renew_caps(struct inode *inode); +extern int ceph_renew_caps(struct inode *inode, int fmode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode); @@ -1038,50 +1038,43 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, return ret; } -static bool dax_range_is_aligned(struct block_device *bdev, - unsigned int offset, unsigned int length) +int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, + struct iomap *iomap) { - unsigned short sector_size = bdev_logical_block_size(bdev); + sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); + pgoff_t pgoff; + long rc, id; + void *kaddr; + bool page_aligned = false; - if (!IS_ALIGNED(offset, sector_size)) - return false; - if (!IS_ALIGNED(length, sector_size)) - return false; - return true; -} + if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && + IS_ALIGNED(size, PAGE_SIZE)) + page_aligned = true; -int __dax_zero_page_range(struct block_device *bdev, - struct dax_device *dax_dev, sector_t sector, - unsigned int offset, unsigned int size) -{ - if (dax_range_is_aligned(bdev, offset, size)) { - sector_t start_sector = sector + (offset >> 9); + rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); + if (rc) + return rc; - return blkdev_issue_zeroout(bdev, start_sector, - size >> 9, GFP_NOFS, 0); - } else { - pgoff_t pgoff; - long rc, id; - void *kaddr; + id = dax_read_lock(); - rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); - if (rc) - return rc; + if (page_aligned) + rc = dax_zero_page_range(iomap->dax_dev, pgoff, + size >> PAGE_SHIFT); + else + rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } - id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); - if (rc < 0) { - dax_read_unlock(id); - return rc; - } + if (!page_aligned) { memset(kaddr + offset, 0, size); - dax_flush(dax_dev, kaddr + offset, size); - dax_read_unlock(id); + dax_flush(iomap->dax_dev, kaddr + offset, size); } + dax_read_unlock(id); return 0; } -EXPORT_SYMBOL_GPL(__dax_zero_page_range); static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index f34757e8f25f..2d357680094c 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/atomic.h> #include <linux/device.h> +#include <linux/pm_runtime.h> #include <linux/poll.h> #include <linux/security.h> @@ -1070,7 +1071,14 @@ static int debugfs_show_regset32(struct seq_file *s, void *data) { struct debugfs_regset32 *regset = s->private; + if (regset->dev) + pm_runtime_get_sync(regset->dev); + debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, ""); + + if (regset->dev) + pm_runtime_put(regset->dev); + return 0; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index eee3c92a9ebf..8c596641a72b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -218,13 +218,18 @@ struct eventpoll { struct file *file; /* used to optimize loop detection check */ - int visited; struct list_head visited_list_link; + int visited; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; #endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* tracks wakeup nests for lockdep validation */ + u8 nests; +#endif }; /* Wait structure used by the poll hooks */ @@ -545,30 +550,47 @@ out_unlock: */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static DEFINE_PER_CPU(int, wakeup_nest); - -static void ep_poll_safewake(wait_queue_head_t *wq) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) { + struct eventpoll *ep_src; unsigned long flags; - int subclass; + u8 nests = 0; - local_irq_save(flags); - preempt_disable(); - subclass = __this_cpu_read(wakeup_nest); - spin_lock_nested(&wq->lock, subclass + 1); - __this_cpu_inc(wakeup_nest); - wake_up_locked_poll(wq, POLLIN); - __this_cpu_dec(wakeup_nest); - spin_unlock(&wq->lock); - local_irq_restore(flags); - preempt_enable(); + /* + * To set the subclass or nesting level for spin_lock_irqsave_nested() + * it might be natural to create a per-cpu nest count. However, since + * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can + * schedule() in the -rt kernel, the per-cpu variable are no longer + * protected. Thus, we are introducing a per eventpoll nest field. + * If we are not being call from ep_poll_callback(), epi is NULL and + * we are at the first level of nesting, 0. Otherwise, we are being + * called from ep_poll_callback() and if a previous wakeup source is + * not an epoll file itself, we are at depth 1 since the wakeup source + * is depth 0. If the wakeup source is a previous epoll file in the + * wakeup chain then we use its nests value and record ours as + * nests + 1. The previous epoll file nests value is stable since its + * already holding its own poll_wait.lock. + */ + if (epi) { + if ((is_file_epoll(epi->ffd.file))) { + ep_src = epi->ffd.file->private_data; + nests = ep_src->nests; + } else { + nests = 1; + } + } + spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); + ep->nests = nests + 1; + wake_up_locked_poll(&ep->poll_wait, EPOLLIN); + ep->nests = 0; + spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else -static void ep_poll_safewake(wait_queue_head_t *wq) +static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) { - wake_up_poll(wq, EPOLLIN); + wake_up_poll(&ep->poll_wait, EPOLLIN); } #endif @@ -789,7 +811,7 @@ static void ep_free(struct eventpoll *ep) /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL); /* * We need to lock this because we could be hit by @@ -1258,7 +1280,7 @@ out_unlock: /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, epi); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; @@ -1562,7 +1584,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL); return 0; @@ -1666,7 +1688,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, /* We have to call this outside the lock */ if (pwake) - ep_poll_safewake(&ep->poll_wait); + ep_poll_safewake(ep, NULL); return 0; } diff --git a/fs/exec.c b/fs/exec.c index 688c824cdac8..06b4c550af5d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1036,16 +1036,26 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) } EXPORT_SYMBOL(read_code); +/* + * Maps the mm_struct mm into the current task struct. + * On success, this function returns with the mutex + * exec_update_mutex locked. + */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; + int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); + ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); + if (ret) + return ret; + if (old_mm) { sync_mm_rss(old_mm); /* @@ -1057,9 +1067,11 @@ static int exec_mmap(struct mm_struct *mm) down_read(&old_mm->mmap_sem); if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); + mutex_unlock(&tsk->signal->exec_update_mutex); return -EINTR; } } + task_lock(tsk); active_mm = tsk->active_mm; membarrier_exec_mmap(mm); @@ -1215,10 +1227,22 @@ no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; -#ifdef CONFIG_POSIX_TIMERS - exit_itimers(sig); - flush_itimer_signals(); -#endif + BUG_ON(!thread_group_leader(tsk)); + return 0; + +killed: + /* protects against exit_notify() and __exit_signal() */ + read_lock(&tasklist_lock); + sig->group_exit_task = NULL; + sig->notify_count = 0; + read_unlock(&tasklist_lock); + return -EAGAIN; +} + + +static int unshare_sighand(struct task_struct *me) +{ + struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; @@ -1236,23 +1260,13 @@ no_thread_group: write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); - rcu_assign_pointer(tsk->sighand, newsighand); + rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } - - BUG_ON(!thread_group_leader(tsk)); return 0; - -killed: - /* protects against exit_notify() and __exit_signal() */ - read_lock(&tasklist_lock); - sig->group_exit_task = NULL; - sig->notify_count = 0; - read_unlock(&tasklist_lock); - return -EAGAIN; } char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) @@ -1286,13 +1300,13 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) */ int flush_old_exec(struct linux_binprm * bprm) { + struct task_struct *me = current; int retval; /* - * Make sure we have a private signal table and that - * we are unassociated from the previous thread group. + * Make this the only thread in the thread group. */ - retval = de_thread(current); + retval = de_thread(me); if (retval) goto out; @@ -1312,18 +1326,31 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; /* - * After clearing bprm->mm (to mark that current is using the - * prepared mm now), we have nothing left of the original + * After setting bprm->called_exec_mmap (to mark that current is + * using the prepared mm now), we have nothing left of the original * process. If anything from here on returns an error, the check * in search_binary_handler() will SEGV current. */ + bprm->called_exec_mmap = 1; bprm->mm = NULL; +#ifdef CONFIG_POSIX_TIMERS + exit_itimers(me->signal); + flush_itimer_signals(); +#endif + + /* + * Make the signal table private. + */ + retval = unshare_sighand(me); + if (retval) + goto out; + set_fs(USER_DS); - current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); - current->personality &= ~bprm->per_clear; + me->personality &= ~bprm->per_clear; /* * We have to apply CLOEXEC before we change whether the process is @@ -1331,7 +1358,7 @@ int flush_old_exec(struct linux_binprm * bprm) * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ - do_close_on_exec(current->files); + do_close_on_exec(me->files); return 0; out: @@ -1412,7 +1439,7 @@ void setup_new_exec(struct linux_binprm * bprm) /* An exec changes our domain. We are no longer part of the thread group */ - current->self_exec_id++; + WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1); flush_signal_handlers(current, 0); } EXPORT_SYMBOL(setup_new_exec); @@ -1450,6 +1477,8 @@ static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { + if (bprm->called_exec_mmap) + mutex_unlock(¤t->signal->exec_update_mutex); mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } @@ -1499,6 +1528,7 @@ void install_exec_creds(struct linux_binprm *bprm) * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); + mutex_unlock(¤t->signal->exec_update_mutex); mutex_unlock(¤t->signal->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); @@ -1690,7 +1720,7 @@ int search_binary_handler(struct linux_binprm *bprm) read_lock(&binfmt_lock); put_binfmt(fmt); - if (retval < 0 && !bprm->mm) { + if (retval < 0 && bprm->called_exec_mmap) { /* we got to flush_old_exec() and failed after it */ read_unlock(&binfmt_lock); force_sigsegv(SIGSEGV); diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig new file mode 100644 index 000000000000..2d3636dc5b8c --- /dev/null +++ b/fs/exfat/Kconfig @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config EXFAT_FS + tristate "exFAT filesystem support" + select NLS + help + This allows you to mount devices formatted with the exFAT file system. + exFAT is typically used on SD-Cards or USB sticks. + + To compile this as a module, choose M here: the module will be called + exfat. + +config EXFAT_DEFAULT_IOCHARSET + string "Default iocharset for exFAT" + default "utf8" + depends on EXFAT_FS + help + Set this to the default input/output character set to use for + converting between the encoding is used for user visible filename and + UTF-16 character that exfat filesystem use, and can be overridden with + the "iocharset" mount option for exFAT filesystems. diff --git a/fs/exfat/Makefile b/fs/exfat/Makefile new file mode 100644 index 000000000000..ed51926a4971 --- /dev/null +++ b/fs/exfat/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Makefile for the linux exFAT filesystem support. +# +obj-$(CONFIG_EXFAT_FS) += exfat.o + +exfat-y := inode.o namei.o dir.o super.o fatent.o cache.o nls.o misc.o \ + file.o balloc.o diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c new file mode 100644 index 000000000000..6a04cc02565a --- /dev/null +++ b/fs/exfat/balloc.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static const unsigned char free_bit[] = { + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/* 0 ~ 19*/ + 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3,/* 20 ~ 39*/ + 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/* 40 ~ 59*/ + 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/* 60 ~ 79*/ + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2,/* 80 ~ 99*/ + 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3,/*100 ~ 119*/ + 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*120 ~ 139*/ + 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5,/*140 ~ 159*/ + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/*160 ~ 179*/ + 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3,/*180 ~ 199*/ + 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*200 ~ 219*/ + 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/*220 ~ 239*/ + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 /*240 ~ 254*/ +}; + +static const unsigned char used_bit[] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3,/* 0 ~ 19*/ + 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4,/* 20 ~ 39*/ + 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5,/* 40 ~ 59*/ + 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,/* 60 ~ 79*/ + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4,/* 80 ~ 99*/ + 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,/*100 ~ 119*/ + 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4,/*120 ~ 139*/ + 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,/*140 ~ 159*/ + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5,/*160 ~ 179*/ + 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5,/*180 ~ 199*/ + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6,/*200 ~ 219*/ + 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,/*220 ~ 239*/ + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 /*240 ~ 255*/ +}; + +/* + * Allocation Bitmap Management Functions + */ +static int exfat_allocate_bitmap(struct super_block *sb, + struct exfat_dentry *ep) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + long long map_size; + unsigned int i, need_map_size; + sector_t sector; + + sbi->map_clu = le32_to_cpu(ep->dentry.bitmap.start_clu); + map_size = le64_to_cpu(ep->dentry.bitmap.size); + need_map_size = ((EXFAT_DATA_CLUSTER_COUNT(sbi) - 1) / BITS_PER_BYTE) + + 1; + if (need_map_size != map_size) { + exfat_msg(sb, KERN_ERR, + "bogus allocation bitmap size(need : %u, cur : %lld)", + need_map_size, map_size); + /* + * Only allowed when bogus allocation + * bitmap size is large + */ + if (need_map_size > map_size) + return -EIO; + } + sbi->map_sectors = ((need_map_size - 1) >> + (sb->s_blocksize_bits)) + 1; + sbi->vol_amap = kmalloc_array(sbi->map_sectors, + sizeof(struct buffer_head *), GFP_KERNEL); + if (!sbi->vol_amap) + return -ENOMEM; + + sector = exfat_cluster_to_sector(sbi, sbi->map_clu); + for (i = 0; i < sbi->map_sectors; i++) { + sbi->vol_amap[i] = sb_bread(sb, sector + i); + if (!sbi->vol_amap[i]) { + /* release all buffers and free vol_amap */ + int j = 0; + + while (j < i) + brelse(sbi->vol_amap[j++]); + + kfree(sbi->vol_amap); + sbi->vol_amap = NULL; + return -EIO; + } + } + + sbi->pbr_bh = NULL; + return 0; +} + +int exfat_load_bitmap(struct super_block *sb) +{ + unsigned int i, type; + struct exfat_chain clu; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + exfat_chain_set(&clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + while (clu.dir != EXFAT_EOF_CLUSTER) { + for (i = 0; i < sbi->dentries_per_clu; i++) { + struct exfat_dentry *ep; + struct buffer_head *bh; + + ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + if (!ep) + return -EIO; + + type = exfat_get_entry_type(ep); + if (type == TYPE_UNUSED) + break; + if (type != TYPE_BITMAP) + continue; + if (ep->dentry.bitmap.flags == 0x0) { + int err; + + err = exfat_allocate_bitmap(sb, ep); + brelse(bh); + return err; + } + brelse(bh); + } + + if (exfat_get_next_cluster(sb, &clu.dir)) + return -EIO; + } + + return -EINVAL; +} + +void exfat_free_bitmap(struct exfat_sb_info *sbi) +{ + int i; + + brelse(sbi->pbr_bh); + + for (i = 0; i < sbi->map_sectors; i++) + __brelse(sbi->vol_amap[i]); + + kfree(sbi->vol_amap); +} + +/* + * If the value of "clu" is 0, it means cluster 2 which is the first cluster of + * the cluster heap. + */ +int exfat_set_bitmap(struct inode *inode, unsigned int clu) +{ + int i, b; + unsigned int ent_idx; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + WARN_ON(clu < EXFAT_FIRST_CLUSTER); + ent_idx = CLUSTER_TO_BITMAP_ENT(clu); + i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); + b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); + + set_bit_le(b, sbi->vol_amap[i]->b_data); + exfat_update_bh(sb, sbi->vol_amap[i], IS_DIRSYNC(inode)); + return 0; +} + +/* + * If the value of "clu" is 0, it means cluster 2 which is the first cluster of + * the cluster heap. + */ +void exfat_clear_bitmap(struct inode *inode, unsigned int clu) +{ + int i, b; + unsigned int ent_idx; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_mount_options *opts = &sbi->options; + + WARN_ON(clu < EXFAT_FIRST_CLUSTER); + ent_idx = CLUSTER_TO_BITMAP_ENT(clu); + i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); + b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); + + clear_bit_le(b, sbi->vol_amap[i]->b_data); + exfat_update_bh(sb, sbi->vol_amap[i], IS_DIRSYNC(inode)); + + if (opts->discard) { + int ret_discard; + + ret_discard = sb_issue_discard(sb, + exfat_cluster_to_sector(sbi, clu + + EXFAT_RESERVED_CLUSTERS), + (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0); + + if (ret_discard == -EOPNOTSUPP) { + exfat_msg(sb, KERN_ERR, + "discard not supported by device, disabling"); + opts->discard = 0; + } + } +} + +/* + * If the value of "clu" is 0, it means cluster 2 which is the first cluster of + * the cluster heap. + */ +unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu) +{ + unsigned int i, map_i, map_b, ent_idx; + unsigned int clu_base, clu_free; + unsigned char k, clu_mask; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + WARN_ON(clu < EXFAT_FIRST_CLUSTER); + ent_idx = CLUSTER_TO_BITMAP_ENT(clu); + clu_base = BITMAP_ENT_TO_CLUSTER(ent_idx & ~(BITS_PER_BYTE_MASK)); + clu_mask = IGNORED_BITS_REMAINED(clu, clu_base); + + map_i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); + map_b = BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent_idx); + + for (i = EXFAT_FIRST_CLUSTER; i < sbi->num_clusters; + i += BITS_PER_BYTE) { + k = *(sbi->vol_amap[map_i]->b_data + map_b); + if (clu_mask > 0) { + k |= clu_mask; + clu_mask = 0; + } + if (k < 0xFF) { + clu_free = clu_base + free_bit[k]; + if (clu_free < sbi->num_clusters) + return clu_free; + } + clu_base += BITS_PER_BYTE; + + if (++map_b >= sb->s_blocksize || + clu_base >= sbi->num_clusters) { + if (++map_i >= sbi->map_sectors) { + clu_base = EXFAT_FIRST_CLUSTER; + map_i = 0; + } + map_b = 0; + } + } + + return EXFAT_EOF_CLUSTER; +} + +int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int count = 0; + unsigned int i, map_i = 0, map_b = 0; + unsigned int total_clus = EXFAT_DATA_CLUSTER_COUNT(sbi); + unsigned int last_mask = total_clus & BITS_PER_BYTE_MASK; + unsigned char clu_bits; + const unsigned char last_bit_mask[] = {0, 0b00000001, 0b00000011, + 0b00000111, 0b00001111, 0b00011111, 0b00111111, 0b01111111}; + + total_clus &= ~last_mask; + for (i = 0; i < total_clus; i += BITS_PER_BYTE) { + clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b); + count += used_bit[clu_bits]; + if (++map_b >= (unsigned int)sb->s_blocksize) { + map_i++; + map_b = 0; + } + } + + if (last_mask) { + clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b); + clu_bits &= last_bit_mask[last_mask]; + count += used_bit[clu_bits]; + } + + *ret_count = count; + return 0; +} diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c new file mode 100644 index 000000000000..03d0824fc368 --- /dev/null +++ b/fs/exfat/cache.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * linux/fs/fat/cache.c + * + * Written 1992,1993 by Werner Almesberger + * + * Mar 1999. AV. Changed cache, so that it uses the starting cluster instead + * of inode number. + * May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers. + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/slab.h> +#include <asm/unaligned.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +#define EXFAT_CACHE_VALID 0 +#define EXFAT_MAX_CACHE 16 + +struct exfat_cache { + struct list_head cache_list; + unsigned int nr_contig; /* number of contiguous clusters */ + unsigned int fcluster; /* cluster number in the file. */ + unsigned int dcluster; /* cluster number on disk. */ +}; + +struct exfat_cache_id { + unsigned int id; + unsigned int nr_contig; + unsigned int fcluster; + unsigned int dcluster; +}; + +static struct kmem_cache *exfat_cachep; + +static void exfat_cache_init_once(void *c) +{ + struct exfat_cache *cache = (struct exfat_cache *)c; + + INIT_LIST_HEAD(&cache->cache_list); +} + +int exfat_cache_init(void) +{ + exfat_cachep = kmem_cache_create("exfat_cache", + sizeof(struct exfat_cache), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + exfat_cache_init_once); + if (!exfat_cachep) + return -ENOMEM; + return 0; +} + +void exfat_cache_shutdown(void) +{ + if (!exfat_cachep) + return; + kmem_cache_destroy(exfat_cachep); +} + +void exfat_cache_init_inode(struct inode *inode) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + + spin_lock_init(&ei->cache_lru_lock); + ei->nr_caches = 0; + ei->cache_valid_id = EXFAT_CACHE_VALID + 1; + INIT_LIST_HEAD(&ei->cache_lru); +} + +static inline struct exfat_cache *exfat_cache_alloc(void) +{ + return kmem_cache_alloc(exfat_cachep, GFP_NOFS); +} + +static inline void exfat_cache_free(struct exfat_cache *cache) +{ + WARN_ON(!list_empty(&cache->cache_list)); + kmem_cache_free(exfat_cachep, cache); +} + +static inline void exfat_cache_update_lru(struct inode *inode, + struct exfat_cache *cache) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + + if (ei->cache_lru.next != &cache->cache_list) + list_move(&cache->cache_list, &ei->cache_lru); +} + +static unsigned int exfat_cache_lookup(struct inode *inode, + unsigned int fclus, struct exfat_cache_id *cid, + unsigned int *cached_fclus, unsigned int *cached_dclus) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + static struct exfat_cache nohit = { .fcluster = 0, }; + struct exfat_cache *hit = &nohit, *p; + unsigned int offset = EXFAT_EOF_CLUSTER; + + spin_lock(&ei->cache_lru_lock); + list_for_each_entry(p, &ei->cache_lru, cache_list) { + /* Find the cache of "fclus" or nearest cache. */ + if (p->fcluster <= fclus && hit->fcluster < p->fcluster) { + hit = p; + if (hit->fcluster + hit->nr_contig < fclus) { + offset = hit->nr_contig; + } else { + offset = fclus - hit->fcluster; + break; + } + } + } + if (hit != &nohit) { + exfat_cache_update_lru(inode, hit); + + cid->id = ei->cache_valid_id; + cid->nr_contig = hit->nr_contig; + cid->fcluster = hit->fcluster; + cid->dcluster = hit->dcluster; + *cached_fclus = cid->fcluster + offset; + *cached_dclus = cid->dcluster + offset; + } + spin_unlock(&ei->cache_lru_lock); + + return offset; +} + +static struct exfat_cache *exfat_cache_merge(struct inode *inode, + struct exfat_cache_id *new) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_cache *p; + + list_for_each_entry(p, &ei->cache_lru, cache_list) { + /* Find the same part as "new" in cluster-chain. */ + if (p->fcluster == new->fcluster) { + if (new->nr_contig > p->nr_contig) + p->nr_contig = new->nr_contig; + return p; + } + } + return NULL; +} + +static void exfat_cache_add(struct inode *inode, + struct exfat_cache_id *new) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_cache *cache, *tmp; + + if (new->fcluster == EXFAT_EOF_CLUSTER) /* dummy cache */ + return; + + spin_lock(&ei->cache_lru_lock); + if (new->id != EXFAT_CACHE_VALID && + new->id != ei->cache_valid_id) + goto unlock; /* this cache was invalidated */ + + cache = exfat_cache_merge(inode, new); + if (cache == NULL) { + if (ei->nr_caches < EXFAT_MAX_CACHE) { + ei->nr_caches++; + spin_unlock(&ei->cache_lru_lock); + + tmp = exfat_cache_alloc(); + if (!tmp) { + spin_lock(&ei->cache_lru_lock); + ei->nr_caches--; + spin_unlock(&ei->cache_lru_lock); + return; + } + + spin_lock(&ei->cache_lru_lock); + cache = exfat_cache_merge(inode, new); + if (cache != NULL) { + ei->nr_caches--; + exfat_cache_free(tmp); + goto out_update_lru; + } + cache = tmp; + } else { + struct list_head *p = ei->cache_lru.prev; + + cache = list_entry(p, + struct exfat_cache, cache_list); + } + cache->fcluster = new->fcluster; + cache->dcluster = new->dcluster; + cache->nr_contig = new->nr_contig; + } +out_update_lru: + exfat_cache_update_lru(inode, cache); +unlock: + spin_unlock(&ei->cache_lru_lock); +} + +/* + * Cache invalidation occurs rarely, thus the LRU chain is not updated. It + * fixes itself after a while. + */ +static void __exfat_cache_inval_inode(struct inode *inode) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_cache *cache; + + while (!list_empty(&ei->cache_lru)) { + cache = list_entry(ei->cache_lru.next, + struct exfat_cache, cache_list); + list_del_init(&cache->cache_list); + ei->nr_caches--; + exfat_cache_free(cache); + } + /* Update. The copy of caches before this id is discarded. */ + ei->cache_valid_id++; + if (ei->cache_valid_id == EXFAT_CACHE_VALID) + ei->cache_valid_id++; +} + +void exfat_cache_inval_inode(struct inode *inode) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + + spin_lock(&ei->cache_lru_lock); + __exfat_cache_inval_inode(inode); + spin_unlock(&ei->cache_lru_lock); +} + +static inline int cache_contiguous(struct exfat_cache_id *cid, + unsigned int dclus) +{ + cid->nr_contig++; + return cid->dcluster + cid->nr_contig == dclus; +} + +static inline void cache_init(struct exfat_cache_id *cid, + unsigned int fclus, unsigned int dclus) +{ + cid->id = EXFAT_CACHE_VALID; + cid->fcluster = fclus; + cid->dcluster = dclus; + cid->nr_contig = 0; +} + +int exfat_get_cluster(struct inode *inode, unsigned int cluster, + unsigned int *fclus, unsigned int *dclus, + unsigned int *last_dclus, int allow_eof) +{ + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int limit = sbi->num_clusters; + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_cache_id cid; + unsigned int content; + + if (ei->start_clu == EXFAT_FREE_CLUSTER) { + exfat_fs_error(sb, + "invalid access to exfat cache (entry 0x%08x)", + ei->start_clu); + return -EIO; + } + + *fclus = 0; + *dclus = ei->start_clu; + *last_dclus = *dclus; + + /* + * Don`t use exfat_cache if zero offset or non-cluster allocation + */ + if (cluster == 0 || *dclus == EXFAT_EOF_CLUSTER) + return 0; + + cache_init(&cid, EXFAT_EOF_CLUSTER, EXFAT_EOF_CLUSTER); + + if (exfat_cache_lookup(inode, cluster, &cid, fclus, dclus) == + EXFAT_EOF_CLUSTER) { + /* + * dummy, always not contiguous + * This is reinitialized by cache_init(), later. + */ + WARN_ON(cid.id != EXFAT_CACHE_VALID || + cid.fcluster != EXFAT_EOF_CLUSTER || + cid.dcluster != EXFAT_EOF_CLUSTER || + cid.nr_contig != 0); + } + + if (*fclus == cluster) + return 0; + + while (*fclus < cluster) { + /* prevent the infinite loop of cluster chain */ + if (*fclus > limit) { + exfat_fs_error(sb, + "detected the cluster chain loop (i_pos %u)", + (*fclus)); + return -EIO; + } + + if (exfat_ent_get(sb, *dclus, &content)) + return -EIO; + + *last_dclus = *dclus; + *dclus = content; + (*fclus)++; + + if (content == EXFAT_EOF_CLUSTER) { + if (!allow_eof) { + exfat_fs_error(sb, + "invalid cluster chain (i_pos %u, last_clus 0x%08x is EOF)", + *fclus, (*last_dclus)); + return -EIO; + } + + break; + } + + if (!cache_contiguous(&cid, *dclus)) + cache_init(&cid, *fclus, *dclus); + } + + exfat_cache_add(inode, &cid); + return 0; +} diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c new file mode 100644 index 000000000000..4b91afb0f051 --- /dev/null +++ b/fs/exfat/dir.c @@ -0,0 +1,1238 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/slab.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static int exfat_extract_uni_name(struct exfat_dentry *ep, + unsigned short *uniname) +{ + int i, len = 0; + + for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) { + *uniname = le16_to_cpu(ep->dentry.name.unicode_0_14[i]); + if (*uniname == 0x0) + return len; + uniname++; + len++; + } + + *uniname = 0x0; + return len; + +} + +static void exfat_get_uniname_from_ext_entry(struct super_block *sb, + struct exfat_chain *p_dir, int entry, unsigned short *uniname) +{ + int i; + struct exfat_dentry *ep; + struct exfat_entry_set_cache *es; + + es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES, &ep); + if (!es) + return; + + if (es->num_entries < 3) + goto free_es; + + ep += 2; + + /* + * First entry : file entry + * Second entry : stream-extension entry + * Third entry : first file-name entry + * So, the index of first file-name dentry should start from 2. + */ + for (i = 2; i < es->num_entries; i++, ep++) { + /* end of name entry */ + if (exfat_get_entry_type(ep) != TYPE_EXTEND) + goto free_es; + + exfat_extract_uni_name(ep, uniname); + uniname += EXFAT_FILE_NAME_LEN; + } + +free_es: + kfree(es); +} + +/* read a directory entry from the opened directory */ +static int exfat_readdir(struct inode *inode, struct exfat_dir_entry *dir_entry) +{ + int i, dentries_per_clu, dentries_per_clu_bits = 0; + unsigned int type, clu_offset; + sector_t sector; + struct exfat_chain dir, clu; + struct exfat_uni_name uni_name; + struct exfat_dentry *ep; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + unsigned int dentry = ei->rwoffset & 0xFFFFFFFF; + struct buffer_head *bh; + + /* check if the given file ID is opened */ + if (ei->type != TYPE_DIR) + return -EPERM; + + if (ei->entry == -1) + exfat_chain_set(&dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + else + exfat_chain_set(&dir, ei->start_clu, + EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); + + dentries_per_clu = sbi->dentries_per_clu; + dentries_per_clu_bits = ilog2(dentries_per_clu); + + clu_offset = dentry >> dentries_per_clu_bits; + exfat_chain_dup(&clu, &dir); + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + clu.dir += clu_offset; + clu.size -= clu_offset; + } else { + /* hint_information */ + if (clu_offset > 0 && ei->hint_bmap.off != EXFAT_EOF_CLUSTER && + ei->hint_bmap.off > 0 && clu_offset >= ei->hint_bmap.off) { + clu_offset -= ei->hint_bmap.off; + clu.dir = ei->hint_bmap.clu; + } + + while (clu_offset > 0) { + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + + clu_offset--; + } + } + + while (clu.dir != EXFAT_EOF_CLUSTER) { + i = dentry & (dentries_per_clu - 1); + + for ( ; i < dentries_per_clu; i++, dentry++) { + ep = exfat_get_dentry(sb, &clu, i, &bh, §or); + if (!ep) + return -EIO; + + type = exfat_get_entry_type(ep); + if (type == TYPE_UNUSED) { + brelse(bh); + break; + } + + if (type != TYPE_FILE && type != TYPE_DIR) { + brelse(bh); + continue; + } + + dir_entry->attr = le16_to_cpu(ep->dentry.file.attr); + exfat_get_entry_time(sbi, &dir_entry->crtime, + ep->dentry.file.create_tz, + ep->dentry.file.create_time, + ep->dentry.file.create_date, + ep->dentry.file.create_time_ms); + exfat_get_entry_time(sbi, &dir_entry->mtime, + ep->dentry.file.modify_tz, + ep->dentry.file.modify_time, + ep->dentry.file.modify_date, + ep->dentry.file.modify_time_ms); + exfat_get_entry_time(sbi, &dir_entry->atime, + ep->dentry.file.access_tz, + ep->dentry.file.access_time, + ep->dentry.file.access_date, + 0); + + *uni_name.name = 0x0; + exfat_get_uniname_from_ext_entry(sb, &dir, dentry, + uni_name.name); + exfat_utf16_to_nls(sb, &uni_name, + dir_entry->namebuf.lfn, + dir_entry->namebuf.lfnbuf_len); + brelse(bh); + + ep = exfat_get_dentry(sb, &clu, i + 1, &bh, NULL); + if (!ep) + return -EIO; + dir_entry->size = + le64_to_cpu(ep->dentry.stream.valid_size); + brelse(bh); + + ei->hint_bmap.off = dentry >> dentries_per_clu_bits; + ei->hint_bmap.clu = clu.dir; + + ei->rwoffset = ++dentry; + return 0; + } + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + if (--clu.size > 0) + clu.dir++; + else + clu.dir = EXFAT_EOF_CLUSTER; + } else { + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + } + } + + dir_entry->namebuf.lfn[0] = '\0'; + ei->rwoffset = dentry; + return 0; +} + +static void exfat_init_namebuf(struct exfat_dentry_namebuf *nb) +{ + nb->lfn = NULL; + nb->lfnbuf_len = 0; +} + +static int exfat_alloc_namebuf(struct exfat_dentry_namebuf *nb) +{ + nb->lfn = __getname(); + if (!nb->lfn) + return -ENOMEM; + nb->lfnbuf_len = MAX_VFSNAME_BUF_SIZE; + return 0; +} + +static void exfat_free_namebuf(struct exfat_dentry_namebuf *nb) +{ + if (!nb->lfn) + return; + + __putname(nb->lfn); + exfat_init_namebuf(nb); +} + +/* skip iterating emit_dots when dir is empty */ +#define ITER_POS_FILLED_DOTS (2) +static int exfat_iterate(struct file *filp, struct dir_context *ctx) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct inode *tmp; + struct exfat_dir_entry de; + struct exfat_dentry_namebuf *nb = &(de.namebuf); + struct exfat_inode_info *ei = EXFAT_I(inode); + unsigned long inum; + loff_t cpos, i_pos; + int err = 0, fake_offset = 0; + + exfat_init_namebuf(nb); + mutex_lock(&EXFAT_SB(sb)->s_lock); + + cpos = ctx->pos; + if (!dir_emit_dots(filp, ctx)) + goto unlock; + + if (ctx->pos == ITER_POS_FILLED_DOTS) { + cpos = 0; + fake_offset = 1; + } + + if (cpos & (DENTRY_SIZE - 1)) { + err = -ENOENT; + goto unlock; + } + + /* name buffer should be allocated before use */ + err = exfat_alloc_namebuf(nb); + if (err) + goto unlock; +get_new: + ei->rwoffset = EXFAT_B_TO_DEN(cpos); + + if (cpos >= i_size_read(inode)) + goto end_of_dir; + + err = exfat_readdir(inode, &de); + if (err) { + /* + * At least we tried to read a sector. Move cpos to next sector + * position (should be aligned). + */ + if (err == -EIO) { + cpos += 1 << (sb->s_blocksize_bits); + cpos &= ~(sb->s_blocksize - 1); + } + + err = -EIO; + goto end_of_dir; + } + + cpos = EXFAT_DEN_TO_B(ei->rwoffset); + + if (!nb->lfn[0]) + goto end_of_dir; + + i_pos = ((loff_t)ei->start_clu << 32) | + ((ei->rwoffset - 1) & 0xffffffff); + tmp = exfat_iget(sb, i_pos); + if (tmp) { + inum = tmp->i_ino; + iput(tmp); + } else { + inum = iunique(sb, EXFAT_ROOT_INO); + } + + /* + * Before calling dir_emit(), sb_lock should be released. + * Because page fault can occur in dir_emit() when the size + * of buffer given from user is larger than one page size. + */ + mutex_unlock(&EXFAT_SB(sb)->s_lock); + if (!dir_emit(ctx, nb->lfn, strlen(nb->lfn), inum, + (de.attr & ATTR_SUBDIR) ? DT_DIR : DT_REG)) + goto out_unlocked; + mutex_lock(&EXFAT_SB(sb)->s_lock); + ctx->pos = cpos; + goto get_new; + +end_of_dir: + if (!cpos && fake_offset) + cpos = ITER_POS_FILLED_DOTS; + ctx->pos = cpos; +unlock: + mutex_unlock(&EXFAT_SB(sb)->s_lock); +out_unlocked: + /* + * To improve performance, free namebuf after unlock sb_lock. + * If namebuf is not allocated, this function do nothing + */ + exfat_free_namebuf(nb); + return err; +} + +const struct file_operations exfat_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = exfat_iterate, + .fsync = generic_file_fsync, +}; + +int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu) +{ + int ret; + + exfat_chain_set(clu, EXFAT_EOF_CLUSTER, 0, ALLOC_NO_FAT_CHAIN); + + ret = exfat_alloc_cluster(inode, 1, clu); + if (ret) + return ret; + + return exfat_zeroed_cluster(inode, clu->dir); +} + +int exfat_calc_num_entries(struct exfat_uni_name *p_uniname) +{ + int len; + + len = p_uniname->name_len; + if (len == 0) + return -EINVAL; + + /* 1 file entry + 1 stream entry + name entries */ + return ((len - 1) / EXFAT_FILE_NAME_LEN + 3); +} + +unsigned int exfat_get_entry_type(struct exfat_dentry *ep) +{ + if (ep->type == EXFAT_UNUSED) + return TYPE_UNUSED; + if (IS_EXFAT_DELETED(ep->type)) + return TYPE_DELETED; + if (ep->type == EXFAT_INVAL) + return TYPE_INVALID; + if (IS_EXFAT_CRITICAL_PRI(ep->type)) { + if (ep->type == EXFAT_BITMAP) + return TYPE_BITMAP; + if (ep->type == EXFAT_UPCASE) + return TYPE_UPCASE; + if (ep->type == EXFAT_VOLUME) + return TYPE_VOLUME; + if (ep->type == EXFAT_FILE) { + if (le16_to_cpu(ep->dentry.file.attr) & ATTR_SUBDIR) + return TYPE_DIR; + return TYPE_FILE; + } + return TYPE_CRITICAL_PRI; + } + if (IS_EXFAT_BENIGN_PRI(ep->type)) { + if (ep->type == EXFAT_GUID) + return TYPE_GUID; + if (ep->type == EXFAT_PADDING) + return TYPE_PADDING; + if (ep->type == EXFAT_ACLTAB) + return TYPE_ACLTAB; + return TYPE_BENIGN_PRI; + } + if (IS_EXFAT_CRITICAL_SEC(ep->type)) { + if (ep->type == EXFAT_STREAM) + return TYPE_STREAM; + if (ep->type == EXFAT_NAME) + return TYPE_EXTEND; + if (ep->type == EXFAT_ACL) + return TYPE_ACL; + return TYPE_CRITICAL_SEC; + } + return TYPE_BENIGN_SEC; +} + +static void exfat_set_entry_type(struct exfat_dentry *ep, unsigned int type) +{ + if (type == TYPE_UNUSED) { + ep->type = EXFAT_UNUSED; + } else if (type == TYPE_DELETED) { + ep->type &= EXFAT_DELETE; + } else if (type == TYPE_STREAM) { + ep->type = EXFAT_STREAM; + } else if (type == TYPE_EXTEND) { + ep->type = EXFAT_NAME; + } else if (type == TYPE_BITMAP) { + ep->type = EXFAT_BITMAP; + } else if (type == TYPE_UPCASE) { + ep->type = EXFAT_UPCASE; + } else if (type == TYPE_VOLUME) { + ep->type = EXFAT_VOLUME; + } else if (type == TYPE_DIR) { + ep->type = EXFAT_FILE; + ep->dentry.file.attr = cpu_to_le16(ATTR_SUBDIR); + } else if (type == TYPE_FILE) { + ep->type = EXFAT_FILE; + ep->dentry.file.attr = cpu_to_le16(ATTR_ARCHIVE); + } +} + +static void exfat_init_stream_entry(struct exfat_dentry *ep, + unsigned char flags, unsigned int start_clu, + unsigned long long size) +{ + exfat_set_entry_type(ep, TYPE_STREAM); + ep->dentry.stream.flags = flags; + ep->dentry.stream.start_clu = cpu_to_le32(start_clu); + ep->dentry.stream.valid_size = cpu_to_le64(size); + ep->dentry.stream.size = cpu_to_le64(size); +} + +static void exfat_init_name_entry(struct exfat_dentry *ep, + unsigned short *uniname) +{ + int i; + + exfat_set_entry_type(ep, TYPE_EXTEND); + ep->dentry.name.flags = 0x0; + + for (i = 0; i < EXFAT_FILE_NAME_LEN; i++) { + ep->dentry.name.unicode_0_14[i] = cpu_to_le16(*uniname); + if (*uniname == 0x0) + break; + uniname++; + } +} + +int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, + int entry, unsigned int type, unsigned int start_clu, + unsigned long long size) +{ + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct timespec64 ts = current_time(inode); + sector_t sector; + struct exfat_dentry *ep; + struct buffer_head *bh; + + /* + * We cannot use exfat_get_dentry_set here because file ep is not + * initialized yet. + */ + ep = exfat_get_dentry(sb, p_dir, entry, &bh, §or); + if (!ep) + return -EIO; + + exfat_set_entry_type(ep, type); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.create_tz, + &ep->dentry.file.create_time, + &ep->dentry.file.create_date, + &ep->dentry.file.create_time_ms); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.modify_tz, + &ep->dentry.file.modify_time, + &ep->dentry.file.modify_date, + &ep->dentry.file.modify_time_ms); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.access_tz, + &ep->dentry.file.access_time, + &ep->dentry.file.access_date, + NULL); + + exfat_update_bh(sb, bh, IS_DIRSYNC(inode)); + brelse(bh); + + ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, §or); + if (!ep) + return -EIO; + + exfat_init_stream_entry(ep, + (type == TYPE_FILE) ? ALLOC_FAT_CHAIN : ALLOC_NO_FAT_CHAIN, + start_clu, size); + exfat_update_bh(sb, bh, IS_DIRSYNC(inode)); + brelse(bh); + + return 0; +} + +int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, + int entry) +{ + struct super_block *sb = inode->i_sb; + int ret = 0; + int i, num_entries; + sector_t sector; + unsigned short chksum; + struct exfat_dentry *ep, *fep; + struct buffer_head *fbh, *bh; + + fep = exfat_get_dentry(sb, p_dir, entry, &fbh, §or); + if (!fep) + return -EIO; + + num_entries = fep->dentry.file.num_ext + 1; + chksum = exfat_calc_chksum_2byte(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY); + + for (i = 1; i < num_entries; i++) { + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, NULL); + if (!ep) { + ret = -EIO; + goto release_fbh; + } + chksum = exfat_calc_chksum_2byte(ep, DENTRY_SIZE, chksum, + CS_DEFAULT); + brelse(bh); + } + + fep->dentry.file.checksum = cpu_to_le16(chksum); + exfat_update_bh(sb, fbh, IS_DIRSYNC(inode)); +release_fbh: + brelse(fbh); + return ret; +} + +int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, + int entry, int num_entries, struct exfat_uni_name *p_uniname) +{ + struct super_block *sb = inode->i_sb; + int i; + sector_t sector; + unsigned short *uniname = p_uniname->name; + struct exfat_dentry *ep; + struct buffer_head *bh; + int sync = IS_DIRSYNC(inode); + + ep = exfat_get_dentry(sb, p_dir, entry, &bh, §or); + if (!ep) + return -EIO; + + ep->dentry.file.num_ext = (unsigned char)(num_entries - 1); + exfat_update_bh(sb, bh, sync); + brelse(bh); + + ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh, §or); + if (!ep) + return -EIO; + + ep->dentry.stream.name_len = p_uniname->name_len; + ep->dentry.stream.name_hash = cpu_to_le16(p_uniname->name_hash); + exfat_update_bh(sb, bh, sync); + brelse(bh); + + for (i = EXFAT_FIRST_CLUSTER; i < num_entries; i++) { + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, §or); + if (!ep) + return -EIO; + + exfat_init_name_entry(ep, uniname); + exfat_update_bh(sb, bh, sync); + brelse(bh); + uniname += EXFAT_FILE_NAME_LEN; + } + + exfat_update_dir_chksum(inode, p_dir, entry); + return 0; +} + +int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir, + int entry, int order, int num_entries) +{ + struct super_block *sb = inode->i_sb; + int i; + sector_t sector; + struct exfat_dentry *ep; + struct buffer_head *bh; + + for (i = order; i < num_entries; i++) { + ep = exfat_get_dentry(sb, p_dir, entry + i, &bh, §or); + if (!ep) + return -EIO; + + exfat_set_entry_type(ep, TYPE_DELETED); + exfat_update_bh(sb, bh, IS_DIRSYNC(inode)); + brelse(bh); + } + + return 0; +} + +int exfat_update_dir_chksum_with_entry_set(struct super_block *sb, + struct exfat_entry_set_cache *es, int sync) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; + sector_t sec = es->sector; + unsigned int off = es->offset; + int chksum_type = CS_DIR_ENTRY, i, num_entries = es->num_entries; + unsigned int buf_off = (off - es->offset); + unsigned int remaining_byte_in_sector, copy_entries, clu; + unsigned short chksum = 0; + + for (i = 0; i < num_entries; i++) { + chksum = exfat_calc_chksum_2byte(&es->entries[i], DENTRY_SIZE, + chksum, chksum_type); + chksum_type = CS_DEFAULT; + } + + es->entries[0].dentry.file.checksum = cpu_to_le16(chksum); + + while (num_entries) { + /* write per sector base */ + remaining_byte_in_sector = (1 << sb->s_blocksize_bits) - off; + copy_entries = min_t(int, + EXFAT_B_TO_DEN(remaining_byte_in_sector), + num_entries); + bh = sb_bread(sb, sec); + if (!bh) + goto err_out; + memcpy(bh->b_data + off, + (unsigned char *)&es->entries[0] + buf_off, + EXFAT_DEN_TO_B(copy_entries)); + exfat_update_bh(sb, bh, sync); + brelse(bh); + num_entries -= copy_entries; + + if (num_entries) { + /* get next sector */ + if (exfat_is_last_sector_in_cluster(sbi, sec)) { + clu = exfat_sector_to_cluster(sbi, sec); + if (es->alloc_flag == ALLOC_NO_FAT_CHAIN) + clu++; + else if (exfat_get_next_cluster(sb, &clu)) + goto err_out; + sec = exfat_cluster_to_sector(sbi, clu); + } else { + sec++; + } + off = 0; + buf_off += EXFAT_DEN_TO_B(copy_entries); + } + } + + return 0; +err_out: + return -EIO; +} + +static int exfat_walk_fat_chain(struct super_block *sb, + struct exfat_chain *p_dir, unsigned int byte_offset, + unsigned int *clu) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int clu_offset; + unsigned int cur_clu; + + clu_offset = EXFAT_B_TO_CLU(byte_offset, sbi); + cur_clu = p_dir->dir; + + if (p_dir->flags == ALLOC_NO_FAT_CHAIN) { + cur_clu += clu_offset; + } else { + while (clu_offset > 0) { + if (exfat_get_next_cluster(sb, &cur_clu)) + return -EIO; + if (cur_clu == EXFAT_EOF_CLUSTER) { + exfat_fs_error(sb, + "invalid dentry access beyond EOF (clu : %u, eidx : %d)", + p_dir->dir, + EXFAT_B_TO_DEN(byte_offset)); + return -EIO; + } + clu_offset--; + } + } + + *clu = cur_clu; + return 0; +} + +int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, + int entry, sector_t *sector, int *offset) +{ + int ret; + unsigned int off, clu = 0; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + off = EXFAT_DEN_TO_B(entry); + + ret = exfat_walk_fat_chain(sb, p_dir, off, &clu); + if (ret) + return ret; + + /* byte offset in cluster */ + off = EXFAT_CLU_OFFSET(off, sbi); + + /* byte offset in sector */ + *offset = EXFAT_BLK_OFFSET(off, sb); + + /* sector offset in cluster */ + *sector = EXFAT_B_TO_BLK(off, sb); + *sector += exfat_cluster_to_sector(sbi, clu); + return 0; +} + +#define EXFAT_MAX_RA_SIZE (128*1024) +static int exfat_dir_readahead(struct super_block *sb, sector_t sec) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; + unsigned int max_ra_count = EXFAT_MAX_RA_SIZE >> sb->s_blocksize_bits; + unsigned int page_ra_count = PAGE_SIZE >> sb->s_blocksize_bits; + unsigned int adj_ra_count = max(sbi->sect_per_clus, page_ra_count); + unsigned int ra_count = min(adj_ra_count, max_ra_count); + + /* Read-ahead is not required */ + if (sbi->sect_per_clus == 1) + return 0; + + if (sec < sbi->data_start_sector) { + exfat_msg(sb, KERN_ERR, + "requested sector is invalid(sect:%llu, root:%llu)", + (unsigned long long)sec, sbi->data_start_sector); + return -EIO; + } + + /* Not sector aligned with ra_count, resize ra_count to page size */ + if ((sec - sbi->data_start_sector) & (ra_count - 1)) + ra_count = page_ra_count; + + bh = sb_find_get_block(sb, sec); + if (!bh || !buffer_uptodate(bh)) { + unsigned int i; + + for (i = 0; i < ra_count; i++) + sb_breadahead(sb, (sector_t)(sec + i)); + } + brelse(bh); + return 0; +} + +struct exfat_dentry *exfat_get_dentry(struct super_block *sb, + struct exfat_chain *p_dir, int entry, struct buffer_head **bh, + sector_t *sector) +{ + unsigned int dentries_per_page = EXFAT_B_TO_DEN(PAGE_SIZE); + int off; + sector_t sec; + + if (p_dir->dir == DIR_DELETED) { + exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry\n"); + return NULL; + } + + if (exfat_find_location(sb, p_dir, entry, &sec, &off)) + return NULL; + + if (p_dir->dir != EXFAT_FREE_CLUSTER && + !(entry & (dentries_per_page - 1))) + exfat_dir_readahead(sb, sec); + + *bh = sb_bread(sb, sec); + if (!*bh) + return NULL; + + if (sector) + *sector = sec; + return (struct exfat_dentry *)((*bh)->b_data + off); +} + +enum exfat_validate_dentry_mode { + ES_MODE_STARTED, + ES_MODE_GET_FILE_ENTRY, + ES_MODE_GET_STRM_ENTRY, + ES_MODE_GET_NAME_ENTRY, + ES_MODE_GET_CRITICAL_SEC_ENTRY, +}; + +static bool exfat_validate_entry(unsigned int type, + enum exfat_validate_dentry_mode *mode) +{ + if (type == TYPE_UNUSED || type == TYPE_DELETED) + return false; + + switch (*mode) { + case ES_MODE_STARTED: + if (type != TYPE_FILE && type != TYPE_DIR) + return false; + *mode = ES_MODE_GET_FILE_ENTRY; + return true; + case ES_MODE_GET_FILE_ENTRY: + if (type != TYPE_STREAM) + return false; + *mode = ES_MODE_GET_STRM_ENTRY; + return true; + case ES_MODE_GET_STRM_ENTRY: + if (type != TYPE_EXTEND) + return false; + *mode = ES_MODE_GET_NAME_ENTRY; + return true; + case ES_MODE_GET_NAME_ENTRY: + if (type == TYPE_STREAM) + return false; + if (type != TYPE_EXTEND) { + if (!(type & TYPE_CRITICAL_SEC)) + return false; + *mode = ES_MODE_GET_CRITICAL_SEC_ENTRY; + } + return true; + case ES_MODE_GET_CRITICAL_SEC_ENTRY: + if (type == TYPE_EXTEND || type == TYPE_STREAM) + return false; + if ((type & TYPE_CRITICAL_SEC) != TYPE_CRITICAL_SEC) + return false; + return true; + default: + WARN_ON_ONCE(1); + return false; + } +} + +/* + * Returns a set of dentries for a file or dir. + * + * Note that this is a copy (dump) of dentries so that user should + * call write_entry_set() to apply changes made in this entry set + * to the real device. + * + * in: + * sb+p_dir+entry: indicates a file/dir + * type: specifies how many dentries should be included. + * out: + * file_ep: will point the first dentry(= file dentry) on success + * return: + * pointer of entry set on success, + * NULL on failure. + */ +struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, + struct exfat_chain *p_dir, int entry, unsigned int type, + struct exfat_dentry **file_ep) +{ + int ret; + unsigned int off, byte_offset, clu = 0; + unsigned int entry_type; + sector_t sec; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_entry_set_cache *es; + struct exfat_dentry *ep, *pos; + unsigned char num_entries; + enum exfat_validate_dentry_mode mode = ES_MODE_STARTED; + struct buffer_head *bh; + + if (p_dir->dir == DIR_DELETED) { + exfat_msg(sb, KERN_ERR, "access to deleted dentry\n"); + return NULL; + } + + byte_offset = EXFAT_DEN_TO_B(entry); + ret = exfat_walk_fat_chain(sb, p_dir, byte_offset, &clu); + if (ret) + return NULL; + + /* byte offset in cluster */ + byte_offset = EXFAT_CLU_OFFSET(byte_offset, sbi); + + /* byte offset in sector */ + off = EXFAT_BLK_OFFSET(byte_offset, sb); + + /* sector offset in cluster */ + sec = EXFAT_B_TO_BLK(byte_offset, sb); + sec += exfat_cluster_to_sector(sbi, clu); + + bh = sb_bread(sb, sec); + if (!bh) + return NULL; + + ep = (struct exfat_dentry *)(bh->b_data + off); + entry_type = exfat_get_entry_type(ep); + + if (entry_type != TYPE_FILE && entry_type != TYPE_DIR) + goto release_bh; + + num_entries = type == ES_ALL_ENTRIES ? + ep->dentry.file.num_ext + 1 : type; + es = kmalloc(struct_size(es, entries, num_entries), GFP_KERNEL); + if (!es) + goto release_bh; + + es->num_entries = num_entries; + es->sector = sec; + es->offset = off; + es->alloc_flag = p_dir->flags; + + pos = &es->entries[0]; + + while (num_entries) { + if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode)) + goto free_es; + + /* copy dentry */ + memcpy(pos, ep, sizeof(struct exfat_dentry)); + + if (--num_entries == 0) + break; + + if (((off + DENTRY_SIZE) & (sb->s_blocksize - 1)) < + (off & (sb->s_blocksize - 1))) { + /* get the next sector */ + if (exfat_is_last_sector_in_cluster(sbi, sec)) { + if (es->alloc_flag == ALLOC_NO_FAT_CHAIN) + clu++; + else if (exfat_get_next_cluster(sb, &clu)) + goto free_es; + sec = exfat_cluster_to_sector(sbi, clu); + } else { + sec++; + } + + brelse(bh); + bh = sb_bread(sb, sec); + if (!bh) + goto free_es; + off = 0; + ep = (struct exfat_dentry *)bh->b_data; + } else { + ep++; + off += DENTRY_SIZE; + } + pos++; + } + + if (file_ep) + *file_ep = &es->entries[0]; + brelse(bh); + return es; + +free_es: + kfree(es); +release_bh: + brelse(bh); + return NULL; +} + +enum { + DIRENT_STEP_FILE, + DIRENT_STEP_STRM, + DIRENT_STEP_NAME, + DIRENT_STEP_SECD, +}; + +/* + * return values: + * >= 0 : return dir entiry position with the name in dir + * -EEXIST : (root dir, ".") it is the root dir itself + * -ENOENT : entry with the name does not exist + * -EIO : I/O error + */ +int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, + struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, + int num_entries, unsigned int type) +{ + int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len; + int order, step, name_len = 0; + int dentries_per_clu, num_empty = 0; + unsigned int entry_type; + unsigned short *uniname = NULL; + struct exfat_chain clu; + struct exfat_hint *hint_stat = &ei->hint_stat; + struct exfat_hint_femp candi_empty; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + dentries_per_clu = sbi->dentries_per_clu; + + exfat_chain_dup(&clu, p_dir); + + if (hint_stat->eidx) { + clu.dir = hint_stat->clu; + dentry = hint_stat->eidx; + end_eidx = dentry; + } + + candi_empty.eidx = EXFAT_HINT_NONE; +rewind: + order = 0; + step = DIRENT_STEP_FILE; + while (clu.dir != EXFAT_EOF_CLUSTER) { + i = dentry & (dentries_per_clu - 1); + for (; i < dentries_per_clu; i++, dentry++) { + struct exfat_dentry *ep; + struct buffer_head *bh; + + if (rewind && dentry == end_eidx) + goto not_found; + + ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + if (!ep) + return -EIO; + + entry_type = exfat_get_entry_type(ep); + + if (entry_type == TYPE_UNUSED || + entry_type == TYPE_DELETED) { + step = DIRENT_STEP_FILE; + + num_empty++; + if (candi_empty.eidx == EXFAT_HINT_NONE && + num_empty == 1) { + exfat_chain_set(&candi_empty.cur, + clu.dir, clu.size, clu.flags); + } + + if (candi_empty.eidx == EXFAT_HINT_NONE && + num_empty >= num_entries) { + candi_empty.eidx = + dentry - (num_empty - 1); + WARN_ON(candi_empty.eidx < 0); + candi_empty.count = num_empty; + + if (ei->hint_femp.eidx == + EXFAT_HINT_NONE || + candi_empty.eidx <= + ei->hint_femp.eidx) { + memcpy(&ei->hint_femp, + &candi_empty, + sizeof(candi_empty)); + } + } + + brelse(bh); + if (entry_type == TYPE_UNUSED) + goto not_found; + continue; + } + + num_empty = 0; + candi_empty.eidx = EXFAT_HINT_NONE; + + if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) { + step = DIRENT_STEP_FILE; + if (type == TYPE_ALL || type == entry_type) { + num_ext = ep->dentry.file.num_ext; + step = DIRENT_STEP_STRM; + } + brelse(bh); + continue; + } + + if (entry_type == TYPE_STREAM) { + unsigned short name_hash; + + if (step != DIRENT_STEP_STRM) { + step = DIRENT_STEP_FILE; + brelse(bh); + continue; + } + step = DIRENT_STEP_FILE; + name_hash = le16_to_cpu( + ep->dentry.stream.name_hash); + if (p_uniname->name_hash == name_hash && + p_uniname->name_len == + ep->dentry.stream.name_len) { + step = DIRENT_STEP_NAME; + order = 1; + name_len = 0; + } + brelse(bh); + continue; + } + + brelse(bh); + if (entry_type == TYPE_EXTEND) { + unsigned short entry_uniname[16], unichar; + + if (step != DIRENT_STEP_NAME) { + step = DIRENT_STEP_FILE; + continue; + } + + if (++order == 2) + uniname = p_uniname->name; + else + uniname += EXFAT_FILE_NAME_LEN; + + len = exfat_extract_uni_name(ep, entry_uniname); + name_len += len; + + unichar = *(uniname+len); + *(uniname+len) = 0x0; + + if (exfat_uniname_ncmp(sb, uniname, + entry_uniname, len)) { + step = DIRENT_STEP_FILE; + } else if (p_uniname->name_len == name_len) { + if (order == num_ext) + goto found; + step = DIRENT_STEP_SECD; + } + + *(uniname+len) = unichar; + continue; + } + + if (entry_type & + (TYPE_CRITICAL_SEC | TYPE_BENIGN_SEC)) { + if (step == DIRENT_STEP_SECD) { + if (++order == num_ext) + goto found; + continue; + } + } + step = DIRENT_STEP_FILE; + } + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + if (--clu.size > 0) + clu.dir++; + else + clu.dir = EXFAT_EOF_CLUSTER; + } else { + if (exfat_get_next_cluster(sb, &clu.dir)) + return -EIO; + } + } + +not_found: + /* + * We started at not 0 index,so we should try to find target + * from 0 index to the index we started at. + */ + if (!rewind && end_eidx) { + rewind = 1; + dentry = 0; + clu.dir = p_dir->dir; + /* reset empty hint */ + num_empty = 0; + candi_empty.eidx = EXFAT_HINT_NONE; + goto rewind; + } + + /* initialized hint_stat */ + hint_stat->clu = p_dir->dir; + hint_stat->eidx = 0; + return -ENOENT; + +found: + /* next dentry we'll find is out of this cluster */ + if (!((dentry + 1) & (dentries_per_clu - 1))) { + int ret = 0; + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + if (--clu.size > 0) + clu.dir++; + else + clu.dir = EXFAT_EOF_CLUSTER; + } else { + ret = exfat_get_next_cluster(sb, &clu.dir); + } + + if (ret || clu.dir != EXFAT_EOF_CLUSTER) { + /* just initialized hint_stat */ + hint_stat->clu = p_dir->dir; + hint_stat->eidx = 0; + return (dentry - num_ext); + } + } + + hint_stat->clu = clu.dir; + hint_stat->eidx = dentry + 1; + return dentry - num_ext; +} + +int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir, + int entry, struct exfat_dentry *ep) +{ + int i, count = 0; + unsigned int type; + struct exfat_dentry *ext_ep; + struct buffer_head *bh; + + for (i = 0, entry++; i < ep->dentry.file.num_ext; i++, entry++) { + ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh, NULL); + if (!ext_ep) + return -EIO; + + type = exfat_get_entry_type(ext_ep); + brelse(bh); + if (type == TYPE_EXTEND || type == TYPE_STREAM) + count++; + else + break; + } + return count; +} + +int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) +{ + int i, count = 0; + int dentries_per_clu; + unsigned int entry_type; + struct exfat_chain clu; + struct exfat_dentry *ep; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; + + dentries_per_clu = sbi->dentries_per_clu; + + exfat_chain_dup(&clu, p_dir); + + while (clu.dir != EXFAT_EOF_CLUSTER) { + for (i = 0; i < dentries_per_clu; i++) { + ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + if (!ep) + return -EIO; + entry_type = exfat_get_entry_type(ep); + brelse(bh); + + if (entry_type == TYPE_UNUSED) + return count; + if (entry_type != TYPE_DIR) + continue; + count++; + } + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + if (--clu.size > 0) + clu.dir++; + else + clu.dir = EXFAT_EOF_CLUSTER; + } else { + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + } + } + + return count; +} diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h new file mode 100644 index 000000000000..67d4e46fb810 --- /dev/null +++ b/fs/exfat/exfat_fs.h @@ -0,0 +1,519 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#ifndef _EXFAT_FS_H +#define _EXFAT_FS_H + +#include <linux/fs.h> +#include <linux/ratelimit.h> +#include <linux/nls.h> + +#define EXFAT_SUPER_MAGIC 0x2011BAB0UL +#define EXFAT_ROOT_INO 1 + +#define EXFAT_SB_DIRTY 0 + +#define EXFAT_CLUSTERS_UNTRACKED (~0u) + +/* + * exfat error flags + */ +enum exfat_error_mode { + EXFAT_ERRORS_CONT, /* ignore error and continue */ + EXFAT_ERRORS_PANIC, /* panic on error */ + EXFAT_ERRORS_RO, /* remount r/o on error */ +}; + +/* + * exfat nls lossy flag + */ +enum { + NLS_NAME_NO_LOSSY, /* no lossy */ + NLS_NAME_LOSSY, /* just detected incorrect filename(s) */ + NLS_NAME_OVERLEN, /* the length is over than its limit */ +}; + +#define EXFAT_HASH_BITS 8 +#define EXFAT_HASH_SIZE (1UL << EXFAT_HASH_BITS) + +/* + * Type Definitions + */ +#define ES_2_ENTRIES 2 +#define ES_ALL_ENTRIES 0 + +#define DIR_DELETED 0xFFFF0321 + +/* type values */ +#define TYPE_UNUSED 0x0000 +#define TYPE_DELETED 0x0001 +#define TYPE_INVALID 0x0002 +#define TYPE_CRITICAL_PRI 0x0100 +#define TYPE_BITMAP 0x0101 +#define TYPE_UPCASE 0x0102 +#define TYPE_VOLUME 0x0103 +#define TYPE_DIR 0x0104 +#define TYPE_FILE 0x011F +#define TYPE_CRITICAL_SEC 0x0200 +#define TYPE_STREAM 0x0201 +#define TYPE_EXTEND 0x0202 +#define TYPE_ACL 0x0203 +#define TYPE_BENIGN_PRI 0x0400 +#define TYPE_GUID 0x0401 +#define TYPE_PADDING 0x0402 +#define TYPE_ACLTAB 0x0403 +#define TYPE_BENIGN_SEC 0x0800 +#define TYPE_ALL 0x0FFF + +#define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */ +#define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */ +#define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE) + +#define FAT_CACHE_SIZE 128 +#define FAT_CACHE_HASH_SIZE 64 +#define BUF_CACHE_SIZE 256 +#define BUF_CACHE_HASH_SIZE 64 + +#define EXFAT_HINT_NONE -1 +#define EXFAT_MIN_SUBDIR 2 + +/* + * helpers for cluster size to byte conversion. + */ +#define EXFAT_CLU_TO_B(b, sbi) ((b) << (sbi)->cluster_size_bits) +#define EXFAT_B_TO_CLU(b, sbi) ((b) >> (sbi)->cluster_size_bits) +#define EXFAT_B_TO_CLU_ROUND_UP(b, sbi) \ + (((b - 1) >> (sbi)->cluster_size_bits) + 1) +#define EXFAT_CLU_OFFSET(off, sbi) ((off) & ((sbi)->cluster_size - 1)) + +/* + * helpers for block size to byte conversion. + */ +#define EXFAT_BLK_TO_B(b, sb) ((b) << (sb)->s_blocksize_bits) +#define EXFAT_B_TO_BLK(b, sb) ((b) >> (sb)->s_blocksize_bits) +#define EXFAT_B_TO_BLK_ROUND_UP(b, sb) \ + (((b - 1) >> (sb)->s_blocksize_bits) + 1) +#define EXFAT_BLK_OFFSET(off, sb) ((off) & ((sb)->s_blocksize - 1)) + +/* + * helpers for block size to dentry size conversion. + */ +#define EXFAT_B_TO_DEN_IDX(b, sbi) \ + ((b) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS)) +#define EXFAT_B_TO_DEN(b) ((b) >> DENTRY_SIZE_BITS) +#define EXFAT_DEN_TO_B(b) ((b) << DENTRY_SIZE_BITS) + +/* + * helpers for fat entry. + */ +#define FAT_ENT_SIZE (4) +#define FAT_ENT_SIZE_BITS (2) +#define FAT_ENT_OFFSET_SECTOR(sb, loc) (EXFAT_SB(sb)->FAT1_start_sector + \ + (((u64)loc << FAT_ENT_SIZE_BITS) >> sb->s_blocksize_bits)) +#define FAT_ENT_OFFSET_BYTE_IN_SECTOR(sb, loc) \ + ((loc << FAT_ENT_SIZE_BITS) & (sb->s_blocksize - 1)) + +/* + * helpers for bitmap. + */ +#define CLUSTER_TO_BITMAP_ENT(clu) ((clu) - EXFAT_RESERVED_CLUSTERS) +#define BITMAP_ENT_TO_CLUSTER(ent) ((ent) + EXFAT_RESERVED_CLUSTERS) +#define BITS_PER_SECTOR(sb) ((sb)->s_blocksize * BITS_PER_BYTE) +#define BITS_PER_SECTOR_MASK(sb) (BITS_PER_SECTOR(sb) - 1) +#define BITMAP_OFFSET_SECTOR_INDEX(sb, ent) \ + ((ent / BITS_PER_BYTE) >> (sb)->s_blocksize_bits) +#define BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent) (ent & BITS_PER_SECTOR_MASK(sb)) +#define BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent) \ + ((ent / BITS_PER_BYTE) & ((sb)->s_blocksize - 1)) +#define BITS_PER_BYTE_MASK 0x7 +#define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1) + +struct exfat_dentry_namebuf { + char *lfn; + int lfnbuf_len; /* usally MAX_UNINAME_BUF_SIZE */ +}; + +/* unicode name structure */ +struct exfat_uni_name { + /* +3 for null and for converting */ + unsigned short name[MAX_NAME_LENGTH + 3]; + unsigned short name_hash; + unsigned char name_len; +}; + +/* directory structure */ +struct exfat_chain { + unsigned int dir; + unsigned int size; + unsigned char flags; +}; + +/* first empty entry hint information */ +struct exfat_hint_femp { + /* entry index of a directory */ + int eidx; + /* count of continuous empty entry */ + int count; + /* the cluster that first empty slot exists in */ + struct exfat_chain cur; +}; + +/* hint structure */ +struct exfat_hint { + unsigned int clu; + union { + unsigned int off; /* cluster offset */ + int eidx; /* entry index */ + }; +}; + +struct exfat_entry_set_cache { + /* sector number that contains file_entry */ + sector_t sector; + /* byte offset in the sector */ + unsigned int offset; + /* flag in stream entry. 01 for cluster chain, 03 for contig. */ + int alloc_flag; + unsigned int num_entries; + struct exfat_dentry entries[]; +}; + +struct exfat_dir_entry { + struct exfat_chain dir; + int entry; + unsigned int type; + unsigned int start_clu; + unsigned char flags; + unsigned short attr; + loff_t size; + unsigned int num_subdirs; + struct timespec64 atime; + struct timespec64 mtime; + struct timespec64 crtime; + struct exfat_dentry_namebuf namebuf; +}; + +/* + * exfat mount in-memory data + */ +struct exfat_mount_options { + kuid_t fs_uid; + kgid_t fs_gid; + unsigned short fs_fmask; + unsigned short fs_dmask; + /* permission for setting the [am]time */ + unsigned short allow_utime; + /* charset for filename input/display */ + char *iocharset; + /* on error: continue, panic, remount-ro */ + enum exfat_error_mode errors; + unsigned utf8:1, /* Use of UTF-8 character set */ + discard:1; /* Issue discard requests on deletions */ + int time_offset; /* Offset of timestamps from UTC (in minutes) */ +}; + +/* + * EXFAT file system superblock in-memory data + */ +struct exfat_sb_info { + unsigned long long num_sectors; /* num of sectors in volume */ + unsigned int num_clusters; /* num of clusters in volume */ + unsigned int cluster_size; /* cluster size in bytes */ + unsigned int cluster_size_bits; + unsigned int sect_per_clus; /* cluster size in sectors */ + unsigned int sect_per_clus_bits; + unsigned long long FAT1_start_sector; /* FAT1 start sector */ + unsigned long long FAT2_start_sector; /* FAT2 start sector */ + unsigned long long data_start_sector; /* data area start sector */ + unsigned int num_FAT_sectors; /* num of FAT sectors */ + unsigned int root_dir; /* root dir cluster */ + unsigned int dentries_per_clu; /* num of dentries per cluster */ + unsigned int vol_flag; /* volume dirty flag */ + struct buffer_head *pbr_bh; /* buffer_head of PBR sector */ + + unsigned int map_clu; /* allocation bitmap start cluster */ + unsigned int map_sectors; /* num of allocation bitmap sectors */ + struct buffer_head **vol_amap; /* allocation bitmap */ + + unsigned short *vol_utbl; /* upcase table */ + + unsigned int clu_srch_ptr; /* cluster search pointer */ + unsigned int used_clusters; /* number of used clusters */ + + unsigned long s_state; + struct mutex s_lock; /* superblock lock */ + struct exfat_mount_options options; + struct nls_table *nls_io; /* Charset used for input and display */ + struct ratelimit_state ratelimit; + + spinlock_t inode_hash_lock; + struct hlist_head inode_hashtable[EXFAT_HASH_SIZE]; + + struct rcu_head rcu; +}; + +/* + * EXFAT file system inode in-memory data + */ +struct exfat_inode_info { + struct exfat_chain dir; + int entry; + unsigned int type; + unsigned short attr; + unsigned int start_clu; + unsigned char flags; + /* + * the copy of low 32bit of i_version to check + * the validation of hint_stat. + */ + unsigned int version; + /* file offset or dentry index for readdir */ + loff_t rwoffset; + + /* hint for cluster last accessed */ + struct exfat_hint hint_bmap; + /* hint for entry index we try to lookup next time */ + struct exfat_hint hint_stat; + /* hint for first empty entry */ + struct exfat_hint_femp hint_femp; + + spinlock_t cache_lru_lock; + struct list_head cache_lru; + int nr_caches; + /* for avoiding the race between alloc and free */ + unsigned int cache_valid_id; + + /* + * NOTE: i_size_ondisk is 64bits, so must hold ->inode_lock to access. + * physically allocated size. + */ + loff_t i_size_ondisk; + /* block-aligned i_size (used in cont_write_begin) */ + loff_t i_size_aligned; + /* on-disk position of directory entry or 0 */ + loff_t i_pos; + /* hash by i_location */ + struct hlist_node i_hash_fat; + /* protect bmap against truncate */ + struct rw_semaphore truncate_lock; + struct inode vfs_inode; + /* File creation time */ + struct timespec64 i_crtime; +}; + +static inline struct exfat_sb_info *EXFAT_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct exfat_inode_info *EXFAT_I(struct inode *inode) +{ + return container_of(inode, struct exfat_inode_info, vfs_inode); +} + +/* + * If ->i_mode can't hold 0222 (i.e. ATTR_RO), we use ->i_attrs to + * save ATTR_RO instead of ->i_mode. + * + * If it's directory and !sbi->options.rodir, ATTR_RO isn't read-only + * bit, it's just used as flag for app. + */ +static inline int exfat_mode_can_hold_ro(struct inode *inode) +{ + struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); + + if (S_ISDIR(inode->i_mode)) + return 0; + + if ((~sbi->options.fs_fmask) & 0222) + return 1; + return 0; +} + +/* Convert attribute bits and a mask to the UNIX mode. */ +static inline mode_t exfat_make_mode(struct exfat_sb_info *sbi, + unsigned short attr, mode_t mode) +{ + if ((attr & ATTR_READONLY) && !(attr & ATTR_SUBDIR)) + mode &= ~0222; + + if (attr & ATTR_SUBDIR) + return (mode & ~sbi->options.fs_dmask) | S_IFDIR; + + return (mode & ~sbi->options.fs_fmask) | S_IFREG; +} + +/* Return the FAT attribute byte for this inode */ +static inline unsigned short exfat_make_attr(struct inode *inode) +{ + unsigned short attr = EXFAT_I(inode)->attr; + + if (S_ISDIR(inode->i_mode)) + attr |= ATTR_SUBDIR; + if (exfat_mode_can_hold_ro(inode) && !(inode->i_mode & 0222)) + attr |= ATTR_READONLY; + return attr; +} + +static inline void exfat_save_attr(struct inode *inode, unsigned short attr) +{ + if (exfat_mode_can_hold_ro(inode)) + EXFAT_I(inode)->attr = attr & (ATTR_RWMASK | ATTR_READONLY); + else + EXFAT_I(inode)->attr = attr & ATTR_RWMASK; +} + +static inline bool exfat_is_last_sector_in_cluster(struct exfat_sb_info *sbi, + sector_t sec) +{ + return ((sec - sbi->data_start_sector + 1) & + ((1 << sbi->sect_per_clus_bits) - 1)) == 0; +} + +static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi, + unsigned int clus) +{ + return ((clus - EXFAT_RESERVED_CLUSTERS) << sbi->sect_per_clus_bits) + + sbi->data_start_sector; +} + +static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi, + sector_t sec) +{ + return ((sec - sbi->data_start_sector) >> sbi->sect_per_clus_bits) + + EXFAT_RESERVED_CLUSTERS; +} + +/* super.c */ +int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag); + +/* fatent.c */ +#define exfat_get_next_cluster(sb, pclu) exfat_ent_get(sb, *(pclu), pclu) + +int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, + struct exfat_chain *p_chain); +int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain); +int exfat_ent_get(struct super_block *sb, unsigned int loc, + unsigned int *content); +int exfat_ent_set(struct super_block *sb, unsigned int loc, + unsigned int content); +int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir, + int entry, struct exfat_dentry *p_entry); +int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain, + unsigned int len); +int exfat_zeroed_cluster(struct inode *dir, unsigned int clu); +int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain, + unsigned int *ret_clu); +int exfat_count_num_clusters(struct super_block *sb, + struct exfat_chain *p_chain, unsigned int *ret_count); + +/* balloc.c */ +int exfat_load_bitmap(struct super_block *sb); +void exfat_free_bitmap(struct exfat_sb_info *sbi); +int exfat_set_bitmap(struct inode *inode, unsigned int clu); +void exfat_clear_bitmap(struct inode *inode, unsigned int clu); +unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu); +int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count); + +/* file.c */ +extern const struct file_operations exfat_file_operations; +int __exfat_truncate(struct inode *inode, loff_t new_size); +void exfat_truncate(struct inode *inode, loff_t size); +int exfat_setattr(struct dentry *dentry, struct iattr *attr); +int exfat_getattr(const struct path *path, struct kstat *stat, + unsigned int request_mask, unsigned int query_flags); + +/* namei.c */ +extern const struct dentry_operations exfat_dentry_ops; +extern const struct dentry_operations exfat_utf8_dentry_ops; + +/* cache.c */ +int exfat_cache_init(void); +void exfat_cache_shutdown(void); +void exfat_cache_init_inode(struct inode *inode); +void exfat_cache_inval_inode(struct inode *inode); +int exfat_get_cluster(struct inode *inode, unsigned int cluster, + unsigned int *fclus, unsigned int *dclus, + unsigned int *last_dclus, int allow_eof); + +/* dir.c */ +extern const struct inode_operations exfat_dir_inode_operations; +extern const struct file_operations exfat_dir_operations; +unsigned int exfat_get_entry_type(struct exfat_dentry *p_entry); +int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir, + int entry, unsigned int type, unsigned int start_clu, + unsigned long long size); +int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir, + int entry, int num_entries, struct exfat_uni_name *p_uniname); +int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir, + int entry, int order, int num_entries); +int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir, + int entry); +int exfat_update_dir_chksum_with_entry_set(struct super_block *sb, + struct exfat_entry_set_cache *es, int sync); +int exfat_calc_num_entries(struct exfat_uni_name *p_uniname); +int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei, + struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, + int num_entries, unsigned int type); +int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu); +int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir, + int entry, sector_t *sector, int *offset); +struct exfat_dentry *exfat_get_dentry(struct super_block *sb, + struct exfat_chain *p_dir, int entry, struct buffer_head **bh, + sector_t *sector); +struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb, + struct exfat_chain *p_dir, int entry, unsigned int type, + struct exfat_dentry **file_ep); +int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir); + +/* inode.c */ +extern const struct inode_operations exfat_file_inode_operations; +void exfat_sync_inode(struct inode *inode); +struct inode *exfat_build_inode(struct super_block *sb, + struct exfat_dir_entry *info, loff_t i_pos); +void exfat_hash_inode(struct inode *inode, loff_t i_pos); +void exfat_unhash_inode(struct inode *inode); +struct inode *exfat_iget(struct super_block *sb, loff_t i_pos); +int exfat_write_inode(struct inode *inode, struct writeback_control *wbc); +void exfat_evict_inode(struct inode *inode); +int exfat_block_truncate_page(struct inode *inode, loff_t from); + +/* exfat/nls.c */ +unsigned short exfat_toupper(struct super_block *sb, unsigned short a); +int exfat_uniname_ncmp(struct super_block *sb, unsigned short *a, + unsigned short *b, unsigned int len); +int exfat_utf16_to_nls(struct super_block *sb, + struct exfat_uni_name *uniname, unsigned char *p_cstring, + int len); +int exfat_nls_to_utf16(struct super_block *sb, + const unsigned char *p_cstring, const int len, + struct exfat_uni_name *uniname, int *p_lossy); +int exfat_create_upcase_table(struct super_block *sb); +void exfat_free_upcase_table(struct exfat_sb_info *sbi); +unsigned short exfat_high_surrogate(unicode_t u); +unsigned short exfat_low_surrogate(unicode_t u); + +/* exfat/misc.c */ +void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) + __printf(3, 4) __cold; +#define exfat_fs_error(sb, fmt, args...) \ + __exfat_fs_error(sb, 1, fmt, ## args) +#define exfat_fs_error_ratelimit(sb, fmt, args...) \ + __exfat_fs_error(sb, __ratelimit(&EXFAT_SB(sb)->ratelimit), \ + fmt, ## args) +void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...) + __printf(3, 4) __cold; +void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, + u8 tz, __le16 time, __le16 date, u8 time_ms); +void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, + u8 *tz, __le16 *time, __le16 *date, u8 *time_ms); +unsigned short exfat_calc_chksum_2byte(void *data, int len, + unsigned short chksum, int type); +void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync); +void exfat_chain_set(struct exfat_chain *ec, unsigned int dir, + unsigned int size, unsigned char flags); +void exfat_chain_dup(struct exfat_chain *dup, struct exfat_chain *ec); + +#endif /* !_EXFAT_FS_H */ diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h new file mode 100644 index 000000000000..2a841010e649 --- /dev/null +++ b/fs/exfat/exfat_raw.h @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#ifndef _EXFAT_RAW_H +#define _EXFAT_RAW_H + +#include <linux/types.h> + +#define PBR_SIGNATURE 0xAA55 + +#define EXFAT_MAX_FILE_LEN 255 + +#define VOL_CLEAN 0x0000 +#define VOL_DIRTY 0x0002 + +#define EXFAT_EOF_CLUSTER 0xFFFFFFFFu +#define EXFAT_BAD_CLUSTER 0xFFFFFFF7u +#define EXFAT_FREE_CLUSTER 0 +/* Cluster 0, 1 are reserved, the first cluster is 2 in the cluster heap. */ +#define EXFAT_RESERVED_CLUSTERS 2 +#define EXFAT_FIRST_CLUSTER 2 +#define EXFAT_DATA_CLUSTER_COUNT(sbi) \ + ((sbi)->num_clusters - EXFAT_RESERVED_CLUSTERS) + +/* AllocationPossible and NoFatChain field in GeneralSecondaryFlags Field */ +#define ALLOC_FAT_CHAIN 0x01 +#define ALLOC_NO_FAT_CHAIN 0x03 + +#define DENTRY_SIZE 32 /* directory entry size */ +#define DENTRY_SIZE_BITS 5 +/* exFAT allows 8388608(256MB) directory entries */ +#define MAX_EXFAT_DENTRIES 8388608 + +/* dentry types */ +#define EXFAT_UNUSED 0x00 /* end of directory */ +#define EXFAT_DELETE (~0x80) +#define IS_EXFAT_DELETED(x) ((x) < 0x80) /* deleted file (0x01~0x7F) */ +#define EXFAT_INVAL 0x80 /* invalid value */ +#define EXFAT_BITMAP 0x81 /* allocation bitmap */ +#define EXFAT_UPCASE 0x82 /* upcase table */ +#define EXFAT_VOLUME 0x83 /* volume label */ +#define EXFAT_FILE 0x85 /* file or dir */ +#define EXFAT_GUID 0xA0 +#define EXFAT_PADDING 0xA1 +#define EXFAT_ACLTAB 0xA2 +#define EXFAT_STREAM 0xC0 /* stream entry */ +#define EXFAT_NAME 0xC1 /* file name entry */ +#define EXFAT_ACL 0xC2 /* stream entry */ + +#define IS_EXFAT_CRITICAL_PRI(x) (x < 0xA0) +#define IS_EXFAT_BENIGN_PRI(x) (x < 0xC0) +#define IS_EXFAT_CRITICAL_SEC(x) (x < 0xE0) + +/* checksum types */ +#define CS_DIR_ENTRY 0 +#define CS_PBR_SECTOR 1 +#define CS_DEFAULT 2 + +/* file attributes */ +#define ATTR_READONLY 0x0001 +#define ATTR_HIDDEN 0x0002 +#define ATTR_SYSTEM 0x0004 +#define ATTR_VOLUME 0x0008 +#define ATTR_SUBDIR 0x0010 +#define ATTR_ARCHIVE 0x0020 + +#define ATTR_RWMASK (ATTR_HIDDEN | ATTR_SYSTEM | ATTR_VOLUME | \ + ATTR_SUBDIR | ATTR_ARCHIVE) + +#define PBR64_JUMP_BOOT_LEN 3 +#define PBR64_OEM_NAME_LEN 8 +#define PBR64_RESERVED_LEN 53 + +#define EXFAT_FILE_NAME_LEN 15 + +/* EXFAT BIOS parameter block (64 bytes) */ +struct bpb64 { + __u8 jmp_boot[PBR64_JUMP_BOOT_LEN]; + __u8 oem_name[PBR64_OEM_NAME_LEN]; + __u8 res_zero[PBR64_RESERVED_LEN]; +} __packed; + +/* EXFAT EXTEND BIOS parameter block (56 bytes) */ +struct bsx64 { + __le64 vol_offset; + __le64 vol_length; + __le32 fat_offset; + __le32 fat_length; + __le32 clu_offset; + __le32 clu_count; + __le32 root_cluster; + __le32 vol_serial; + __u8 fs_version[2]; + __le16 vol_flags; + __u8 sect_size_bits; + __u8 sect_per_clus_bits; + __u8 num_fats; + __u8 phy_drv_no; + __u8 perc_in_use; + __u8 reserved2[7]; +} __packed; + +/* EXFAT PBR[BPB+BSX] (120 bytes) */ +struct pbr64 { + struct bpb64 bpb; + struct bsx64 bsx; +} __packed; + +/* Common PBR[Partition Boot Record] (512 bytes) */ +struct pbr { + union { + __u8 raw[64]; + struct bpb64 f64; + } bpb; + union { + __u8 raw[56]; + struct bsx64 f64; + } bsx; + __u8 boot_code[390]; + __le16 signature; +} __packed; + +struct exfat_dentry { + __u8 type; + union { + struct { + __u8 num_ext; + __le16 checksum; + __le16 attr; + __le16 reserved1; + __le16 create_time; + __le16 create_date; + __le16 modify_time; + __le16 modify_date; + __le16 access_time; + __le16 access_date; + __u8 create_time_ms; + __u8 modify_time_ms; + __u8 create_tz; + __u8 modify_tz; + __u8 access_tz; + __u8 reserved2[7]; + } __packed file; /* file directory entry */ + struct { + __u8 flags; + __u8 reserved1; + __u8 name_len; + __le16 name_hash; + __le16 reserved2; + __le64 valid_size; + __le32 reserved3; + __le32 start_clu; + __le64 size; + } __packed stream; /* stream extension directory entry */ + struct { + __u8 flags; + __le16 unicode_0_14[EXFAT_FILE_NAME_LEN]; + } __packed name; /* file name directory entry */ + struct { + __u8 flags; + __u8 reserved[18]; + __le32 start_clu; + __le64 size; + } __packed bitmap; /* allocation bitmap directory entry */ + struct { + __u8 reserved1[3]; + __le32 checksum; + __u8 reserved2[12]; + __le32 start_clu; + __le64 size; + } __packed upcase; /* up-case table directory entry */ + } __packed dentry; +} __packed; + +#define EXFAT_TZ_VALID (1 << 7) + +/* Jan 1 GMT 00:00:00 1980 */ +#define EXFAT_MIN_TIMESTAMP_SECS 315532800LL +/* Dec 31 GMT 23:59:59 2107 */ +#define EXFAT_MAX_TIMESTAMP_SECS 4354819199LL + +#endif /* !_EXFAT_RAW_H */ diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c new file mode 100644 index 000000000000..a855b1769a96 --- /dev/null +++ b/fs/exfat/fatent.c @@ -0,0 +1,463 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/slab.h> +#include <asm/unaligned.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static int exfat_mirror_bh(struct super_block *sb, sector_t sec, + struct buffer_head *bh) +{ + struct buffer_head *c_bh; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + sector_t sec2; + int err = 0; + + if (sbi->FAT2_start_sector != sbi->FAT1_start_sector) { + sec2 = sec - sbi->FAT1_start_sector + sbi->FAT2_start_sector; + c_bh = sb_getblk(sb, sec2); + if (!c_bh) + return -ENOMEM; + memcpy(c_bh->b_data, bh->b_data, sb->s_blocksize); + set_buffer_uptodate(c_bh); + mark_buffer_dirty(c_bh); + if (sb->s_flags & SB_SYNCHRONOUS) + err = sync_dirty_buffer(c_bh); + brelse(c_bh); + } + + return err; +} + +static int __exfat_ent_get(struct super_block *sb, unsigned int loc, + unsigned int *content) +{ + unsigned int off; + sector_t sec; + struct buffer_head *bh; + + sec = FAT_ENT_OFFSET_SECTOR(sb, loc); + off = FAT_ENT_OFFSET_BYTE_IN_SECTOR(sb, loc); + + bh = sb_bread(sb, sec); + if (!bh) + return -EIO; + + *content = le32_to_cpu(*(__le32 *)(&bh->b_data[off])); + + /* remap reserved clusters to simplify code */ + if (*content > EXFAT_BAD_CLUSTER) + *content = EXFAT_EOF_CLUSTER; + + brelse(bh); + return 0; +} + +int exfat_ent_set(struct super_block *sb, unsigned int loc, + unsigned int content) +{ + unsigned int off; + sector_t sec; + __le32 *fat_entry; + struct buffer_head *bh; + + sec = FAT_ENT_OFFSET_SECTOR(sb, loc); + off = FAT_ENT_OFFSET_BYTE_IN_SECTOR(sb, loc); + + bh = sb_bread(sb, sec); + if (!bh) + return -EIO; + + fat_entry = (__le32 *)&(bh->b_data[off]); + *fat_entry = cpu_to_le32(content); + exfat_update_bh(sb, bh, sb->s_flags & SB_SYNCHRONOUS); + exfat_mirror_bh(sb, sec, bh); + brelse(bh); + return 0; +} + +static inline bool is_valid_cluster(struct exfat_sb_info *sbi, + unsigned int clus) +{ + if (clus < EXFAT_FIRST_CLUSTER || sbi->num_clusters <= clus) + return false; + return true; +} + +int exfat_ent_get(struct super_block *sb, unsigned int loc, + unsigned int *content) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + int err; + + if (!is_valid_cluster(sbi, loc)) { + exfat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", + loc); + return -EIO; + } + + err = __exfat_ent_get(sb, loc, content); + if (err) { + exfat_fs_error(sb, + "failed to access to FAT (entry 0x%08x, err:%d)", + loc, err); + return err; + } + + if (*content == EXFAT_FREE_CLUSTER) { + exfat_fs_error(sb, + "invalid access to FAT free cluster (entry 0x%08x)", + loc); + return -EIO; + } + + if (*content == EXFAT_BAD_CLUSTER) { + exfat_fs_error(sb, + "invalid access to FAT bad cluster (entry 0x%08x)", + loc); + return -EIO; + } + + if (*content != EXFAT_EOF_CLUSTER && !is_valid_cluster(sbi, *content)) { + exfat_fs_error(sb, + "invalid access to FAT (entry 0x%08x) bogus content (0x%08x)", + loc, *content); + return -EIO; + } + + return 0; +} + +int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain, + unsigned int len) +{ + if (!len) + return 0; + + while (len > 1) { + if (exfat_ent_set(sb, chain, chain + 1)) + return -EIO; + chain++; + len--; + } + + if (exfat_ent_set(sb, chain, EXFAT_EOF_CLUSTER)) + return -EIO; + return 0; +} + +int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) +{ + unsigned int num_clusters = 0; + unsigned int clu; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + /* invalid cluster number */ + if (p_chain->dir == EXFAT_FREE_CLUSTER || + p_chain->dir == EXFAT_EOF_CLUSTER || + p_chain->dir < EXFAT_FIRST_CLUSTER) + return 0; + + /* no cluster to truncate */ + if (p_chain->size == 0) + return 0; + + /* check cluster validation */ + if (p_chain->dir < 2 && p_chain->dir >= sbi->num_clusters) { + exfat_msg(sb, KERN_ERR, "invalid start cluster (%u)", + p_chain->dir); + return -EIO; + } + + set_bit(EXFAT_SB_DIRTY, &sbi->s_state); + clu = p_chain->dir; + + if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + do { + exfat_clear_bitmap(inode, clu); + clu++; + + num_clusters++; + } while (num_clusters < p_chain->size); + } else { + do { + exfat_clear_bitmap(inode, clu); + + if (exfat_get_next_cluster(sb, &clu)) + goto dec_used_clus; + + num_clusters++; + } while (clu != EXFAT_EOF_CLUSTER); + } + +dec_used_clus: + sbi->used_clusters -= num_clusters; + return 0; +} + +int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain, + unsigned int *ret_clu) +{ + unsigned int clu, next; + unsigned int count = 0; + + next = p_chain->dir; + if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + *ret_clu = next + p_chain->size - 1; + return 0; + } + + do { + count++; + clu = next; + if (exfat_ent_get(sb, clu, &next)) + return -EIO; + } while (next != EXFAT_EOF_CLUSTER); + + if (p_chain->size != count) { + exfat_fs_error(sb, + "bogus directory size (clus : ondisk(%d) != counted(%d))", + p_chain->size, count); + return -EIO; + } + + *ret_clu = clu; + return 0; +} + +static inline int exfat_sync_bhs(struct buffer_head **bhs, int nr_bhs) +{ + int i, err = 0; + + for (i = 0; i < nr_bhs; i++) + write_dirty_buffer(bhs[i], 0); + + for (i = 0; i < nr_bhs; i++) { + wait_on_buffer(bhs[i]); + if (!err && !buffer_uptodate(bhs[i])) + err = -EIO; + } + return err; +} + +int exfat_zeroed_cluster(struct inode *dir, unsigned int clu) +{ + struct super_block *sb = dir->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + int nr_bhs = MAX_BUF_PER_PAGE; + sector_t blknr, last_blknr; + int err, i, n; + + blknr = exfat_cluster_to_sector(sbi, clu); + last_blknr = blknr + sbi->sect_per_clus; + + if (last_blknr > sbi->num_sectors && sbi->num_sectors > 0) { + exfat_fs_error_ratelimit(sb, + "%s: out of range(sect:%llu len:%u)", + __func__, (unsigned long long)blknr, + sbi->sect_per_clus); + return -EIO; + } + + /* Zeroing the unused blocks on this cluster */ + n = 0; + while (blknr < last_blknr) { + bhs[n] = sb_getblk(sb, blknr); + if (!bhs[n]) { + err = -ENOMEM; + goto release_bhs; + } + memset(bhs[n]->b_data, 0, sb->s_blocksize); + exfat_update_bh(sb, bhs[n], 0); + + n++; + blknr++; + + if (n == nr_bhs) { + if (IS_DIRSYNC(dir)) { + err = exfat_sync_bhs(bhs, n); + if (err) + goto release_bhs; + } + + for (i = 0; i < n; i++) + brelse(bhs[i]); + n = 0; + } + } + + if (IS_DIRSYNC(dir)) { + err = exfat_sync_bhs(bhs, n); + if (err) + goto release_bhs; + } + + for (i = 0; i < n; i++) + brelse(bhs[i]); + + return 0; + +release_bhs: + exfat_msg(sb, KERN_ERR, "failed zeroed sect %llu\n", + (unsigned long long)blknr); + for (i = 0; i < n; i++) + bforget(bhs[i]); + return err; +} + +int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, + struct exfat_chain *p_chain) +{ + int ret = -ENOSPC; + unsigned int num_clusters = 0, total_cnt; + unsigned int hint_clu, new_clu, last_clu = EXFAT_EOF_CLUSTER; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + total_cnt = EXFAT_DATA_CLUSTER_COUNT(sbi); + + if (unlikely(total_cnt < sbi->used_clusters)) { + exfat_fs_error_ratelimit(sb, + "%s: invalid used clusters(t:%u,u:%u)\n", + __func__, total_cnt, sbi->used_clusters); + return -EIO; + } + + if (num_alloc > total_cnt - sbi->used_clusters) + return -ENOSPC; + + hint_clu = p_chain->dir; + /* find new cluster */ + if (hint_clu == EXFAT_EOF_CLUSTER) { + if (sbi->clu_srch_ptr < EXFAT_FIRST_CLUSTER) { + exfat_msg(sb, KERN_ERR, + "sbi->clu_srch_ptr is invalid (%u)\n", + sbi->clu_srch_ptr); + sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER; + } + + hint_clu = exfat_find_free_bitmap(sb, sbi->clu_srch_ptr); + if (hint_clu == EXFAT_EOF_CLUSTER) + return -ENOSPC; + } + + /* check cluster validation */ + if (hint_clu < EXFAT_FIRST_CLUSTER && hint_clu >= sbi->num_clusters) { + exfat_msg(sb, KERN_ERR, "hint_cluster is invalid (%u)\n", + hint_clu); + hint_clu = EXFAT_FIRST_CLUSTER; + if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + if (exfat_chain_cont_cluster(sb, p_chain->dir, + num_clusters)) + return -EIO; + p_chain->flags = ALLOC_FAT_CHAIN; + } + } + + set_bit(EXFAT_SB_DIRTY, &sbi->s_state); + + p_chain->dir = EXFAT_EOF_CLUSTER; + + while ((new_clu = exfat_find_free_bitmap(sb, hint_clu)) != + EXFAT_EOF_CLUSTER) { + if (new_clu != hint_clu && + p_chain->flags == ALLOC_NO_FAT_CHAIN) { + if (exfat_chain_cont_cluster(sb, p_chain->dir, + num_clusters)) { + ret = -EIO; + goto free_cluster; + } + p_chain->flags = ALLOC_FAT_CHAIN; + } + + /* update allocation bitmap */ + if (exfat_set_bitmap(inode, new_clu)) { + ret = -EIO; + goto free_cluster; + } + + num_clusters++; + + /* update FAT table */ + if (p_chain->flags == ALLOC_FAT_CHAIN) { + if (exfat_ent_set(sb, new_clu, EXFAT_EOF_CLUSTER)) { + ret = -EIO; + goto free_cluster; + } + } + + if (p_chain->dir == EXFAT_EOF_CLUSTER) { + p_chain->dir = new_clu; + } else if (p_chain->flags == ALLOC_FAT_CHAIN) { + if (exfat_ent_set(sb, last_clu, new_clu)) { + ret = -EIO; + goto free_cluster; + } + } + last_clu = new_clu; + + if (--num_alloc == 0) { + sbi->clu_srch_ptr = hint_clu; + sbi->used_clusters += num_clusters; + + p_chain->size += num_clusters; + return 0; + } + + hint_clu = new_clu + 1; + if (hint_clu >= sbi->num_clusters) { + hint_clu = EXFAT_FIRST_CLUSTER; + + if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + if (exfat_chain_cont_cluster(sb, p_chain->dir, + num_clusters)) { + ret = -EIO; + goto free_cluster; + } + p_chain->flags = ALLOC_FAT_CHAIN; + } + } + } +free_cluster: + if (num_clusters) + exfat_free_cluster(inode, p_chain); + return ret; +} + +int exfat_count_num_clusters(struct super_block *sb, + struct exfat_chain *p_chain, unsigned int *ret_count) +{ + unsigned int i, count; + unsigned int clu; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + if (!p_chain->dir || p_chain->dir == EXFAT_EOF_CLUSTER) { + *ret_count = 0; + return 0; + } + + if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + *ret_count = p_chain->size; + return 0; + } + + clu = p_chain->dir; + count = 0; + for (i = EXFAT_FIRST_CLUSTER; i < sbi->num_clusters; i++) { + count++; + if (exfat_ent_get(sb, clu, &clu)) + return -EIO; + if (clu == EXFAT_EOF_CLUSTER) + break; + } + + *ret_count = count; + return 0; +} diff --git a/fs/exfat/file.c b/fs/exfat/file.c new file mode 100644 index 000000000000..483f683757aa --- /dev/null +++ b/fs/exfat/file.c @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/slab.h> +#include <linux/cred.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static int exfat_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + loff_t start = i_size_read(inode), count = size - i_size_read(inode); + int err, err2; + + err = generic_cont_expand_simple(inode, size); + if (err) + return err; + + inode->i_ctime = inode->i_mtime = current_time(inode); + mark_inode_dirty(inode); + + if (!IS_SYNC(inode)) + return 0; + + err = filemap_fdatawrite_range(mapping, start, start + count - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (err) + return err; + + return filemap_fdatawait_range(mapping, start, start + count - 1); +} + +static bool exfat_allow_set_time(struct exfat_sb_info *sbi, struct inode *inode) +{ + mode_t allow_utime = sbi->options.allow_utime; + + if (!uid_eq(current_fsuid(), inode->i_uid)) { + if (in_group_p(inode->i_gid)) + allow_utime >>= 3; + if (allow_utime & MAY_WRITE) + return true; + } + + /* use a default check */ + return false; +} + +static int exfat_sanitize_mode(const struct exfat_sb_info *sbi, + struct inode *inode, umode_t *mode_ptr) +{ + mode_t i_mode, mask, perm; + + i_mode = inode->i_mode; + + mask = (S_ISREG(i_mode) || S_ISLNK(i_mode)) ? + sbi->options.fs_fmask : sbi->options.fs_dmask; + perm = *mode_ptr & ~(S_IFMT | mask); + + /* Of the r and x bits, all (subject to umask) must be present.*/ + if ((perm & 0555) != (i_mode & 0555)) + return -EPERM; + + if (exfat_mode_can_hold_ro(inode)) { + /* + * Of the w bits, either all (subject to umask) or none must + * be present. + */ + if ((perm & 0222) && ((perm & 0222) != (0222 & ~mask))) + return -EPERM; + } else { + /* + * If exfat_mode_can_hold_ro(inode) is false, can't change + * w bits. + */ + if ((perm & 0222) != (0222 & ~mask)) + return -EPERM; + } + + *mode_ptr &= S_IFMT | perm; + + return 0; +} + +/* resize the file length */ +int __exfat_truncate(struct inode *inode, loff_t new_size) +{ + unsigned int num_clusters_new, num_clusters_phys; + unsigned int last_clu = EXFAT_FREE_CLUSTER; + struct exfat_chain clu; + struct exfat_dentry *ep, *ep2; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_entry_set_cache *es = NULL; + int evict = (ei->dir.dir == DIR_DELETED) ? 1 : 0; + + /* check if the given file ID is opened */ + if (ei->type != TYPE_FILE && ei->type != TYPE_DIR) + return -EPERM; + + exfat_set_vol_flags(sb, VOL_DIRTY); + + num_clusters_new = EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi); + num_clusters_phys = + EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, sbi); + + exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags); + + if (new_size > 0) { + /* + * Truncate FAT chain num_clusters after the first cluster + * num_clusters = min(new, phys); + */ + unsigned int num_clusters = + min(num_clusters_new, num_clusters_phys); + + /* + * Follow FAT chain + * (defensive coding - works fine even with corrupted FAT table + */ + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + clu.dir += num_clusters; + clu.size -= num_clusters; + } else { + while (num_clusters > 0) { + last_clu = clu.dir; + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + + num_clusters--; + clu.size--; + } + } + } else { + ei->flags = ALLOC_NO_FAT_CHAIN; + ei->start_clu = EXFAT_EOF_CLUSTER; + } + + i_size_write(inode, new_size); + + if (ei->type == TYPE_FILE) + ei->attr |= ATTR_ARCHIVE; + + /* update the directory entry */ + if (!evict) { + struct timespec64 ts; + + es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, + ES_ALL_ENTRIES, &ep); + if (!es) + return -EIO; + ep2 = ep + 1; + + ts = current_time(inode); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.modify_tz, + &ep->dentry.file.modify_time, + &ep->dentry.file.modify_date, + &ep->dentry.file.modify_time_ms); + ep->dentry.file.attr = cpu_to_le16(ei->attr); + + /* File size should be zero if there is no cluster allocated */ + if (ei->start_clu == EXFAT_EOF_CLUSTER) { + ep->dentry.stream.valid_size = 0; + ep->dentry.stream.size = 0; + } else { + ep->dentry.stream.valid_size = cpu_to_le64(new_size); + ep->dentry.stream.size = ep->dentry.stream.valid_size; + } + + if (new_size == 0) { + /* Any directory can not be truncated to zero */ + WARN_ON(ei->type != TYPE_FILE); + + ep2->dentry.stream.flags = ALLOC_FAT_CHAIN; + ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER; + } + + if (exfat_update_dir_chksum_with_entry_set(sb, es, + inode_needs_sync(inode))) + return -EIO; + kfree(es); + } + + /* cut off from the FAT chain */ + if (ei->flags == ALLOC_FAT_CHAIN && last_clu != EXFAT_FREE_CLUSTER && + last_clu != EXFAT_EOF_CLUSTER) { + if (exfat_ent_set(sb, last_clu, EXFAT_EOF_CLUSTER)) + return -EIO; + } + + /* invalidate cache and free the clusters */ + /* clear exfat cache */ + exfat_cache_inval_inode(inode); + + /* hint information */ + ei->hint_bmap.off = EXFAT_EOF_CLUSTER; + ei->hint_bmap.clu = EXFAT_EOF_CLUSTER; + if (ei->rwoffset > new_size) + ei->rwoffset = new_size; + + /* hint_stat will be used if this is directory. */ + ei->hint_stat.eidx = 0; + ei->hint_stat.clu = ei->start_clu; + ei->hint_femp.eidx = EXFAT_HINT_NONE; + + /* free the clusters */ + if (exfat_free_cluster(inode, &clu)) + return -EIO; + + exfat_set_vol_flags(sb, VOL_CLEAN); + + return 0; +} + +void exfat_truncate(struct inode *inode, loff_t size) +{ + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int blocksize = 1 << inode->i_blkbits; + loff_t aligned_size; + int err; + + mutex_lock(&sbi->s_lock); + if (EXFAT_I(inode)->start_clu == 0) { + /* + * Empty start_clu != ~0 (not allocated) + */ + exfat_fs_error(sb, "tried to truncate zeroed cluster."); + goto write_size; + } + + err = __exfat_truncate(inode, i_size_read(inode)); + if (err) + goto write_size; + + inode->i_ctime = inode->i_mtime = current_time(inode); + if (IS_DIRSYNC(inode)) + exfat_sync_inode(inode); + else + mark_inode_dirty(inode); + + inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & + ~(sbi->cluster_size - 1)) >> inode->i_blkbits; +write_size: + aligned_size = i_size_read(inode); + if (aligned_size & (blocksize - 1)) { + aligned_size |= (blocksize - 1); + aligned_size++; + } + + if (EXFAT_I(inode)->i_size_ondisk > i_size_read(inode)) + EXFAT_I(inode)->i_size_ondisk = aligned_size; + + if (EXFAT_I(inode)->i_size_aligned > i_size_read(inode)) + EXFAT_I(inode)->i_size_aligned = aligned_size; + mutex_unlock(&sbi->s_lock); +} + +int exfat_getattr(const struct path *path, struct kstat *stat, + unsigned int request_mask, unsigned int query_flags) +{ + struct inode *inode = d_backing_inode(path->dentry); + struct exfat_inode_info *ei = EXFAT_I(inode); + + generic_fillattr(inode, stat); + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = ei->i_crtime.tv_sec; + stat->btime.tv_nsec = ei->i_crtime.tv_nsec; + stat->blksize = EXFAT_SB(inode->i_sb)->cluster_size; + return 0; +} + +int exfat_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb); + struct inode *inode = dentry->d_inode; + unsigned int ia_valid; + int error; + + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size > i_size_read(inode)) { + error = exfat_cont_expand(inode, attr->ia_size); + if (error || attr->ia_valid == ATTR_SIZE) + return error; + attr->ia_valid &= ~ATTR_SIZE; + } + + /* Check for setting the inode time. */ + ia_valid = attr->ia_valid; + if ((ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) && + exfat_allow_set_time(sbi, inode)) { + attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET | + ATTR_TIMES_SET); + } + + error = setattr_prepare(dentry, attr); + attr->ia_valid = ia_valid; + if (error) + goto out; + + if (((attr->ia_valid & ATTR_UID) && + !uid_eq(attr->ia_uid, sbi->options.fs_uid)) || + ((attr->ia_valid & ATTR_GID) && + !gid_eq(attr->ia_gid, sbi->options.fs_gid)) || + ((attr->ia_valid & ATTR_MODE) && + (attr->ia_mode & ~(S_IFREG | S_IFLNK | S_IFDIR | 0777)))) { + error = -EPERM; + goto out; + } + + /* + * We don't return -EPERM here. Yes, strange, but this is too + * old behavior. + */ + if (attr->ia_valid & ATTR_MODE) { + if (exfat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0) + attr->ia_valid &= ~ATTR_MODE; + } + + if (attr->ia_valid & ATTR_SIZE) { + error = exfat_block_truncate_page(inode, attr->ia_size); + if (error) + goto out; + + down_write(&EXFAT_I(inode)->truncate_lock); + truncate_setsize(inode, attr->ia_size); + exfat_truncate(inode, attr->ia_size); + up_write(&EXFAT_I(inode)->truncate_lock); + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + +out: + return error; +} + +const struct file_operations exfat_file_operations = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations exfat_file_inode_operations = { + .setattr = exfat_setattr, + .getattr = exfat_getattr, +}; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c new file mode 100644 index 000000000000..06887492f54b --- /dev/null +++ b/fs/exfat/inode.c @@ -0,0 +1,671 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/init.h> +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/time.h> +#include <linux/writeback.h> +#include <linux/uio.h> +#include <linux/random.h> +#include <linux/iversion.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static int __exfat_write_inode(struct inode *inode, int sync) +{ + int ret = -EIO; + unsigned long long on_disk_size; + struct exfat_dentry *ep, *ep2; + struct exfat_entry_set_cache *es = NULL; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + bool is_dir = (ei->type == TYPE_DIR) ? true : false; + + if (inode->i_ino == EXFAT_ROOT_INO) + return 0; + + /* + * If the indode is already unlinked, there is no need for updating it. + */ + if (ei->dir.dir == DIR_DELETED) + return 0; + + if (is_dir && ei->dir.dir == sbi->root_dir && ei->entry == -1) + return 0; + + exfat_set_vol_flags(sb, VOL_DIRTY); + + /* get the directory entry of given file or directory */ + es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES, + &ep); + if (!es) + return -EIO; + ep2 = ep + 1; + + ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode)); + + /* set FILE_INFO structure using the acquired struct exfat_dentry */ + exfat_set_entry_time(sbi, &ei->i_crtime, + &ep->dentry.file.create_tz, + &ep->dentry.file.create_time, + &ep->dentry.file.create_date, + &ep->dentry.file.create_time_ms); + exfat_set_entry_time(sbi, &inode->i_mtime, + &ep->dentry.file.modify_tz, + &ep->dentry.file.modify_time, + &ep->dentry.file.modify_date, + &ep->dentry.file.modify_time_ms); + exfat_set_entry_time(sbi, &inode->i_atime, + &ep->dentry.file.access_tz, + &ep->dentry.file.access_time, + &ep->dentry.file.access_date, + NULL); + + /* File size should be zero if there is no cluster allocated */ + on_disk_size = i_size_read(inode); + + if (ei->start_clu == EXFAT_EOF_CLUSTER) + on_disk_size = 0; + + ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size); + ep2->dentry.stream.size = ep2->dentry.stream.valid_size; + + ret = exfat_update_dir_chksum_with_entry_set(sb, es, sync); + kfree(es); + return ret; +} + +int exfat_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int ret; + + mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); + ret = __exfat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); + mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); + + return ret; +} + +void exfat_sync_inode(struct inode *inode) +{ + lockdep_assert_held(&EXFAT_SB(inode->i_sb)->s_lock); + __exfat_write_inode(inode, 1); +} + +/* + * Input: inode, (logical) clu_offset, target allocation area + * Output: errcode, cluster number + * *clu = (~0), if it's unable to allocate a new cluster + */ +static int exfat_map_cluster(struct inode *inode, unsigned int clu_offset, + unsigned int *clu, int create) +{ + int ret, modified = false; + unsigned int last_clu; + struct exfat_chain new_clu; + struct exfat_dentry *ep; + struct exfat_entry_set_cache *es = NULL; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + unsigned int local_clu_offset = clu_offset; + unsigned int num_to_be_allocated = 0, num_clusters = 0; + + ei->rwoffset = EXFAT_CLU_TO_B(clu_offset, sbi); + + if (EXFAT_I(inode)->i_size_ondisk > 0) + num_clusters = + EXFAT_B_TO_CLU_ROUND_UP(EXFAT_I(inode)->i_size_ondisk, + sbi); + + if (clu_offset >= num_clusters) + num_to_be_allocated = clu_offset - num_clusters + 1; + + if (!create && (num_to_be_allocated > 0)) { + *clu = EXFAT_EOF_CLUSTER; + return 0; + } + + *clu = last_clu = ei->start_clu; + + if (ei->flags == ALLOC_NO_FAT_CHAIN) { + if (clu_offset > 0 && *clu != EXFAT_EOF_CLUSTER) { + last_clu += clu_offset - 1; + + if (clu_offset == num_clusters) + *clu = EXFAT_EOF_CLUSTER; + else + *clu += clu_offset; + } + } else if (ei->type == TYPE_FILE) { + unsigned int fclus = 0; + int err = exfat_get_cluster(inode, clu_offset, + &fclus, clu, &last_clu, 1); + if (err) + return -EIO; + + clu_offset -= fclus; + } else { + /* hint information */ + if (clu_offset > 0 && ei->hint_bmap.off != EXFAT_EOF_CLUSTER && + ei->hint_bmap.off > 0 && clu_offset >= ei->hint_bmap.off) { + clu_offset -= ei->hint_bmap.off; + /* hint_bmap.clu should be valid */ + WARN_ON(ei->hint_bmap.clu < 2); + *clu = ei->hint_bmap.clu; + } + + while (clu_offset > 0 && *clu != EXFAT_EOF_CLUSTER) { + last_clu = *clu; + if (exfat_get_next_cluster(sb, clu)) + return -EIO; + clu_offset--; + } + } + + if (*clu == EXFAT_EOF_CLUSTER) { + exfat_set_vol_flags(sb, VOL_DIRTY); + + new_clu.dir = (last_clu == EXFAT_EOF_CLUSTER) ? + EXFAT_EOF_CLUSTER : last_clu + 1; + new_clu.size = 0; + new_clu.flags = ei->flags; + + /* allocate a cluster */ + if (num_to_be_allocated < 1) { + /* Broken FAT (i_sze > allocated FAT) */ + exfat_fs_error(sb, "broken FAT chain."); + return -EIO; + } + + ret = exfat_alloc_cluster(inode, num_to_be_allocated, &new_clu); + if (ret) + return ret; + + if (new_clu.dir == EXFAT_EOF_CLUSTER || + new_clu.dir == EXFAT_FREE_CLUSTER) { + exfat_fs_error(sb, + "bogus cluster new allocated (last_clu : %u, new_clu : %u)", + last_clu, new_clu.dir); + return -EIO; + } + + /* append to the FAT chain */ + if (last_clu == EXFAT_EOF_CLUSTER) { + if (new_clu.flags == ALLOC_FAT_CHAIN) + ei->flags = ALLOC_FAT_CHAIN; + ei->start_clu = new_clu.dir; + modified = true; + } else { + if (new_clu.flags != ei->flags) { + /* no-fat-chain bit is disabled, + * so fat-chain should be synced with + * alloc-bitmap + */ + exfat_chain_cont_cluster(sb, ei->start_clu, + num_clusters); + ei->flags = ALLOC_FAT_CHAIN; + modified = true; + } + if (new_clu.flags == ALLOC_FAT_CHAIN) + if (exfat_ent_set(sb, last_clu, new_clu.dir)) + return -EIO; + } + + num_clusters += num_to_be_allocated; + *clu = new_clu.dir; + + if (ei->dir.dir != DIR_DELETED) { + es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, + ES_ALL_ENTRIES, &ep); + if (!es) + return -EIO; + /* get stream entry */ + ep++; + + /* update directory entry */ + if (modified) { + if (ep->dentry.stream.flags != ei->flags) + ep->dentry.stream.flags = ei->flags; + + if (le32_to_cpu(ep->dentry.stream.start_clu) != + ei->start_clu) + ep->dentry.stream.start_clu = + cpu_to_le32(ei->start_clu); + + ep->dentry.stream.valid_size = + cpu_to_le64(i_size_read(inode)); + ep->dentry.stream.size = + ep->dentry.stream.valid_size; + } + + if (exfat_update_dir_chksum_with_entry_set(sb, es, + inode_needs_sync(inode))) + return -EIO; + kfree(es); + + } /* end of if != DIR_DELETED */ + + inode->i_blocks += + num_to_be_allocated << sbi->sect_per_clus_bits; + + /* + * Move *clu pointer along FAT chains (hole care) because the + * caller of this function expect *clu to be the last cluster. + * This only works when num_to_be_allocated >= 2, + * *clu = (the first cluster of the allocated chain) => + * (the last cluster of ...) + */ + if (ei->flags == ALLOC_NO_FAT_CHAIN) { + *clu += num_to_be_allocated - 1; + } else { + while (num_to_be_allocated > 1) { + if (exfat_get_next_cluster(sb, clu)) + return -EIO; + num_to_be_allocated--; + } + } + + } + + /* hint information */ + ei->hint_bmap.off = local_clu_offset; + ei->hint_bmap.clu = *clu; + + return 0; +} + +static int exfat_map_new_buffer(struct exfat_inode_info *ei, + struct buffer_head *bh, loff_t pos) +{ + if (buffer_delay(bh) && pos > ei->i_size_aligned) + return -EIO; + set_buffer_new(bh); + + /* + * Adjust i_size_aligned if i_size_ondisk is bigger than it. + */ + if (ei->i_size_ondisk > ei->i_size_aligned) + ei->i_size_aligned = ei->i_size_ondisk; + return 0; +} + +static int exfat_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct exfat_inode_info *ei = EXFAT_I(inode); + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err = 0; + unsigned long mapped_blocks = 0; + unsigned int cluster, sec_offset; + sector_t last_block; + sector_t phys = 0; + loff_t pos; + + mutex_lock(&sbi->s_lock); + last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb); + if (iblock >= last_block && !create) + goto done; + + /* Is this block already allocated? */ + err = exfat_map_cluster(inode, iblock >> sbi->sect_per_clus_bits, + &cluster, create); + if (err) { + if (err != -ENOSPC) + exfat_fs_error_ratelimit(sb, + "failed to bmap (inode : %p iblock : %llu, err : %d)", + inode, (unsigned long long)iblock, err); + goto unlock_ret; + } + + if (cluster == EXFAT_EOF_CLUSTER) + goto done; + + /* sector offset in cluster */ + sec_offset = iblock & (sbi->sect_per_clus - 1); + + phys = exfat_cluster_to_sector(sbi, cluster) + sec_offset; + mapped_blocks = sbi->sect_per_clus - sec_offset; + max_blocks = min(mapped_blocks, max_blocks); + + /* Treat newly added block / cluster */ + if (iblock < last_block) + create = 0; + + if (create || buffer_delay(bh_result)) { + pos = EXFAT_BLK_TO_B((iblock + 1), sb); + if (ei->i_size_ondisk < pos) + ei->i_size_ondisk = pos; + } + + if (create) { + err = exfat_map_new_buffer(ei, bh_result, pos); + if (err) { + exfat_fs_error(sb, + "requested for bmap out of range(pos : (%llu) > i_size_aligned(%llu)\n", + pos, ei->i_size_aligned); + goto unlock_ret; + } + } + + if (buffer_delay(bh_result)) + clear_buffer_delay(bh_result); + map_bh(bh_result, sb, phys); +done: + bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb); +unlock_ret: + mutex_unlock(&sbi->s_lock); + return err; +} + +static int exfat_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, exfat_get_block); +} + +static int exfat_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned int nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, exfat_get_block); +} + +static int exfat_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, exfat_get_block, wbc); +} + +static int exfat_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, exfat_get_block); +} + +static void exfat_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > i_size_read(inode)) { + truncate_pagecache(inode, i_size_read(inode)); + exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned); + } +} + +static int exfat_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int flags, + struct page **pagep, void **fsdata) +{ + int ret; + + *pagep = NULL; + ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + exfat_get_block, + &EXFAT_I(mapping->host)->i_size_ondisk); + + if (ret < 0) + exfat_write_failed(mapping, pos+len); + + return ret; +} + +static int exfat_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned int len, unsigned int copied, + struct page *pagep, void *fsdata) +{ + struct inode *inode = mapping->host; + struct exfat_inode_info *ei = EXFAT_I(inode); + int err; + + err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); + + if (EXFAT_I(inode)->i_size_aligned < i_size_read(inode)) { + exfat_fs_error(inode->i_sb, + "invalid size(size(%llu) > aligned(%llu)\n", + i_size_read(inode), EXFAT_I(inode)->i_size_aligned); + return -EIO; + } + + if (err < len) + exfat_write_failed(mapping, pos+len); + + if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) { + inode->i_mtime = inode->i_ctime = current_time(inode); + ei->attr |= ATTR_ARCHIVE; + mark_inode_dirty(inode); + } + + return err; +} + +static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + loff_t size = iocb->ki_pos + iov_iter_count(iter); + int rw = iov_iter_rw(iter); + ssize_t ret; + + if (rw == WRITE) { + /* + * FIXME: blockdev_direct_IO() doesn't use ->write_begin(), + * so we need to update the ->i_size_aligned to block boundary. + * + * But we must fill the remaining area or hole by nul for + * updating ->i_size_aligned + * + * Return 0, and fallback to normal buffered write. + */ + if (EXFAT_I(inode)->i_size_aligned < size) + return 0; + } + + /* + * Need to use the DIO_LOCKING for avoiding the race + * condition of exfat_get_block() and ->truncate(). + */ + ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block); + if (ret < 0 && (rw & WRITE)) + exfat_write_failed(mapping, size); + return ret; +} + +static sector_t exfat_aop_bmap(struct address_space *mapping, sector_t block) +{ + sector_t blocknr; + + /* exfat_get_cluster() assumes the requested blocknr isn't truncated. */ + down_read(&EXFAT_I(mapping->host)->truncate_lock); + blocknr = generic_block_bmap(mapping, block, exfat_get_block); + up_read(&EXFAT_I(mapping->host)->truncate_lock); + return blocknr; +} + +/* + * exfat_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This is required during truncate to physically zeroout the tail end + * of that block so it doesn't yield old data if the file is later grown. + * Also, avoid causing failure from fsx for cases of "data past EOF" + */ +int exfat_block_truncate_page(struct inode *inode, loff_t from) +{ + return block_truncate_page(inode->i_mapping, from, exfat_get_block); +} + +static const struct address_space_operations exfat_aops = { + .readpage = exfat_readpage, + .readpages = exfat_readpages, + .writepage = exfat_writepage, + .writepages = exfat_writepages, + .write_begin = exfat_write_begin, + .write_end = exfat_write_end, + .direct_IO = exfat_direct_IO, + .bmap = exfat_aop_bmap +}; + +static inline unsigned long exfat_hash(loff_t i_pos) +{ + return hash_32(i_pos, EXFAT_HASH_BITS); +} + +void exfat_hash_inode(struct inode *inode, loff_t i_pos) +{ + struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); + struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos); + + spin_lock(&sbi->inode_hash_lock); + EXFAT_I(inode)->i_pos = i_pos; + hlist_add_head(&EXFAT_I(inode)->i_hash_fat, head); + spin_unlock(&sbi->inode_hash_lock); +} + +void exfat_unhash_inode(struct inode *inode) +{ + struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); + + spin_lock(&sbi->inode_hash_lock); + hlist_del_init(&EXFAT_I(inode)->i_hash_fat); + EXFAT_I(inode)->i_pos = 0; + spin_unlock(&sbi->inode_hash_lock); +} + +struct inode *exfat_iget(struct super_block *sb, loff_t i_pos) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *info; + struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos); + struct inode *inode = NULL; + + spin_lock(&sbi->inode_hash_lock); + hlist_for_each_entry(info, head, i_hash_fat) { + WARN_ON(info->vfs_inode.i_sb != sb); + + if (i_pos != info->i_pos) + continue; + inode = igrab(&info->vfs_inode); + if (inode) + break; + } + spin_unlock(&sbi->inode_hash_lock); + return inode; +} + +/* doesn't deal with root inode */ +static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) +{ + struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + loff_t size = info->size; + + memcpy(&ei->dir, &info->dir, sizeof(struct exfat_chain)); + ei->entry = info->entry; + ei->attr = info->attr; + ei->start_clu = info->start_clu; + ei->flags = info->flags; + ei->type = info->type; + + ei->version = 0; + ei->hint_stat.eidx = 0; + ei->hint_stat.clu = info->start_clu; + ei->hint_femp.eidx = EXFAT_HINT_NONE; + ei->rwoffset = 0; + ei->hint_bmap.off = EXFAT_EOF_CLUSTER; + ei->i_pos = 0; + + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + inode_inc_iversion(inode); + inode->i_generation = prandom_u32(); + + if (info->attr & ATTR_SUBDIR) { /* directory */ + inode->i_generation &= ~1; + inode->i_mode = exfat_make_mode(sbi, info->attr, 0777); + inode->i_op = &exfat_dir_inode_operations; + inode->i_fop = &exfat_dir_operations; + set_nlink(inode, info->num_subdirs); + } else { /* regular file */ + inode->i_generation |= 1; + inode->i_mode = exfat_make_mode(sbi, info->attr, 0777); + inode->i_op = &exfat_file_inode_operations; + inode->i_fop = &exfat_file_operations; + inode->i_mapping->a_ops = &exfat_aops; + inode->i_mapping->nrpages = 0; + } + + i_size_write(inode, size); + + /* ondisk and aligned size should be aligned with block size */ + if (size & (inode->i_sb->s_blocksize - 1)) { + size |= (inode->i_sb->s_blocksize - 1); + size++; + } + + ei->i_size_aligned = size; + ei->i_size_ondisk = size; + + exfat_save_attr(inode, info->attr); + + inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & + ~(sbi->cluster_size - 1)) >> inode->i_blkbits; + inode->i_mtime = info->mtime; + inode->i_ctime = info->mtime; + ei->i_crtime = info->crtime; + inode->i_atime = info->atime; + + exfat_cache_init_inode(inode); + + return 0; +} + +struct inode *exfat_build_inode(struct super_block *sb, + struct exfat_dir_entry *info, loff_t i_pos) +{ + struct inode *inode; + int err; + + inode = exfat_iget(sb, i_pos); + if (inode) + goto out; + inode = new_inode(sb); + if (!inode) { + inode = ERR_PTR(-ENOMEM); + goto out; + } + inode->i_ino = iunique(sb, EXFAT_ROOT_INO); + inode_set_iversion(inode, 1); + err = exfat_fill_inode(inode, info); + if (err) { + iput(inode); + inode = ERR_PTR(err); + goto out; + } + exfat_hash_inode(inode, i_pos); + insert_inode_hash(inode); +out: + return inode; +} + +void exfat_evict_inode(struct inode *inode) +{ + truncate_inode_pages(&inode->i_data, 0); + + if (!inode->i_nlink) { + i_size_write(inode, 0); + mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); + __exfat_truncate(inode, 0); + mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); + } + + invalidate_inode_buffers(inode); + clear_inode(inode); + exfat_cache_inval_inode(inode); + exfat_unhash_inode(inode); +} diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c new file mode 100644 index 000000000000..14a3300848f6 --- /dev/null +++ b/fs/exfat/misc.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Written 1992,1993 by Werner Almesberger + * 22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980 + * and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru) + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +/* + * exfat_fs_error reports a file system problem that might indicate fa data + * corruption/inconsistency. Depending on 'errors' mount option the + * panic() is called, or error message is printed FAT and nothing is done, + * or filesystem is remounted read-only (default behavior). + * In case the file system is remounted read-only, it can be made writable + * again by remounting it. + */ +void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) +{ + struct exfat_mount_options *opts = &EXFAT_SB(sb)->options; + va_list args; + struct va_format vaf; + + if (report) { + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + exfat_msg(sb, KERN_ERR, "error, %pV\n", &vaf); + va_end(args); + } + + if (opts->errors == EXFAT_ERRORS_PANIC) { + panic("exFAT-fs (%s): fs panic from previous error\n", + sb->s_id); + } else if (opts->errors == EXFAT_ERRORS_RO && !sb_rdonly(sb)) { + sb->s_flags |= SB_RDONLY; + exfat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); + } +} + +/* + * exfat_msg() - print preformated EXFAT specific messages. + * All logs except what uses exfat_fs_error() should be written by exfat_msg() + */ +void exfat_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + /* level means KERN_ pacility level */ + printk("%sexFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + +#define SECS_PER_MIN (60) +#define TIMEZONE_SEC(x) ((x) * 15 * SECS_PER_MIN) + +static void exfat_adjust_tz(struct timespec64 *ts, u8 tz_off) +{ + if (tz_off <= 0x3F) + ts->tv_sec -= TIMEZONE_SEC(tz_off); + else /* 0x40 <= (tz_off & 0x7F) <=0x7F */ + ts->tv_sec += TIMEZONE_SEC(0x80 - tz_off); +} + +/* Convert a EXFAT time/date pair to a UNIX date (seconds since 1 1 70). */ +void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, + u8 tz, __le16 time, __le16 date, u8 time_ms) +{ + u16 t = le16_to_cpu(time); + u16 d = le16_to_cpu(date); + + ts->tv_sec = mktime64(1980 + (d >> 9), d >> 5 & 0x000F, d & 0x001F, + t >> 11, (t >> 5) & 0x003F, (t & 0x001F) << 1); + + + /* time_ms field represent 0 ~ 199(1990 ms) */ + if (time_ms) { + ts->tv_sec += time_ms / 100; + ts->tv_nsec = (time_ms % 100) * 10 * NSEC_PER_MSEC; + } + + if (tz & EXFAT_TZ_VALID) + /* Adjust timezone to UTC0. */ + exfat_adjust_tz(ts, tz & ~EXFAT_TZ_VALID); + else + /* Convert from local time to UTC using time_offset. */ + ts->tv_sec -= sbi->options.time_offset * SECS_PER_MIN; +} + +/* Convert linear UNIX date to a EXFAT time/date pair. */ +void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, + u8 *tz, __le16 *time, __le16 *date, u8 *time_ms) +{ + struct tm tm; + u16 t, d; + + time64_to_tm(ts->tv_sec, 0, &tm); + t = (tm.tm_hour << 11) | (tm.tm_min << 5) | (tm.tm_sec >> 1); + d = ((tm.tm_year - 80) << 9) | ((tm.tm_mon + 1) << 5) | tm.tm_mday; + + *time = cpu_to_le16(t); + *date = cpu_to_le16(d); + + /* time_ms field represent 0 ~ 199(1990 ms) */ + if (time_ms) + *time_ms = (tm.tm_sec & 1) * 100 + + ts->tv_nsec / (10 * NSEC_PER_MSEC); + + /* + * Record 00h value for OffsetFromUtc field and 1 value for OffsetValid + * to indicate that local time and UTC are the same. + */ + *tz = EXFAT_TZ_VALID; +} + +unsigned short exfat_calc_chksum_2byte(void *data, int len, + unsigned short chksum, int type) +{ + int i; + unsigned char *c = (unsigned char *)data; + + for (i = 0; i < len; i++, c++) { + if (((i == 2) || (i == 3)) && (type == CS_DIR_ENTRY)) + continue; + chksum = (((chksum & 1) << 15) | ((chksum & 0xFFFE) >> 1)) + + (unsigned short)*c; + } + return chksum; +} + +void exfat_update_bh(struct super_block *sb, struct buffer_head *bh, int sync) +{ + set_bit(EXFAT_SB_DIRTY, &EXFAT_SB(sb)->s_state); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + + if (sync) + sync_dirty_buffer(bh); +} + +void exfat_chain_set(struct exfat_chain *ec, unsigned int dir, + unsigned int size, unsigned char flags) +{ + ec->dir = dir; + ec->size = size; + ec->flags = flags; +} + +void exfat_chain_dup(struct exfat_chain *dup, struct exfat_chain *ec) +{ + return exfat_chain_set(dup, ec->dir, ec->size, ec->flags); +} diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c new file mode 100644 index 000000000000..a8681d91f569 --- /dev/null +++ b/fs/exfat/namei.c @@ -0,0 +1,1448 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/iversion.h> +#include <linux/namei.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <linux/nls.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static inline unsigned long exfat_d_version(struct dentry *dentry) +{ + return (unsigned long) dentry->d_fsdata; +} + +static inline void exfat_d_version_set(struct dentry *dentry, + unsigned long version) +{ + dentry->d_fsdata = (void *) version; +} + +/* + * If new entry was created in the parent, it could create the 8.3 alias (the + * shortname of logname). So, the parent may have the negative-dentry which + * matches the created 8.3 alias. + * + * If it happened, the negative dentry isn't actually negative anymore. So, + * drop it. + */ +static int exfat_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + int ret; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + /* + * This is not negative dentry. Always valid. + * + * Note, rename() to existing directory entry will have ->d_inode, and + * will use existing name which isn't specified name by user. + * + * We may be able to drop this positive dentry here. But dropping + * positive dentry isn't good idea. So it's unsupported like + * rename("filename", "FILENAME") for now. + */ + if (d_really_is_positive(dentry)) + return 1; + + /* + * Drop the negative dentry, in order to make sure to use the case + * sensitive name which is specified by user if this is for creation. + */ + if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + + spin_lock(&dentry->d_lock); + ret = inode_eq_iversion(d_inode(dentry->d_parent), + exfat_d_version(dentry)); + spin_unlock(&dentry->d_lock); + return ret; +} + +/* returns the length of a struct qstr, ignoring trailing dots */ +static unsigned int exfat_striptail_len(unsigned int len, const char *name) +{ + while (len && name[len - 1] == '.') + len--; + return len; +} + +/* + * Compute the hash for the exfat name corresponding to the dentry. If the name + * is invalid, we leave the hash code unchanged so that the existing dentry can + * be used. The exfat fs routines will return ENOENT or EINVAL as appropriate. + */ +static int exfat_d_hash(const struct dentry *dentry, struct qstr *qstr) +{ + struct super_block *sb = dentry->d_sb; + struct nls_table *t = EXFAT_SB(sb)->nls_io; + const unsigned char *name = qstr->name; + unsigned int len = exfat_striptail_len(qstr->len, qstr->name); + unsigned long hash = init_name_hash(dentry); + int i, charlen; + wchar_t c; + + for (i = 0; i < len; i += charlen) { + charlen = t->char2uni(&name[i], len - i, &c); + if (charlen < 0) + return charlen; + hash = partial_name_hash(exfat_toupper(sb, c), hash); + } + + qstr->hash = end_name_hash(hash); + return 0; +} + +static int exfat_d_cmp(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +{ + struct super_block *sb = dentry->d_sb; + struct nls_table *t = EXFAT_SB(sb)->nls_io; + unsigned int alen = exfat_striptail_len(name->len, name->name); + unsigned int blen = exfat_striptail_len(len, str); + wchar_t c1, c2; + int charlen, i; + + if (alen != blen) + return 1; + + for (i = 0; i < len; i += charlen) { + charlen = t->char2uni(&name->name[i], alen - i, &c1); + if (charlen < 0) + return 1; + if (charlen != t->char2uni(&str[i], blen - i, &c2)) + return 1; + + if (exfat_toupper(sb, c1) != exfat_toupper(sb, c2)) + return 1; + } + + return 0; +} + +const struct dentry_operations exfat_dentry_ops = { + .d_revalidate = exfat_d_revalidate, + .d_hash = exfat_d_hash, + .d_compare = exfat_d_cmp, +}; + +static int exfat_utf8_d_hash(const struct dentry *dentry, struct qstr *qstr) +{ + struct super_block *sb = dentry->d_sb; + const unsigned char *name = qstr->name; + unsigned int len = exfat_striptail_len(qstr->len, qstr->name); + unsigned long hash = init_name_hash(dentry); + int i, charlen; + unicode_t u; + + for (i = 0; i < len; i += charlen) { + charlen = utf8_to_utf32(&name[i], len - i, &u); + if (charlen < 0) + return charlen; + + /* + * Convert to UTF-16: code points above U+FFFF are encoded as + * surrogate pairs. + * exfat_toupper() works only for code points up to the U+FFFF. + */ + if (u > 0xFFFF) { + hash = partial_name_hash(exfat_high_surrogate(u), hash); + hash = partial_name_hash(exfat_low_surrogate(u), hash); + } else { + hash = partial_name_hash(exfat_toupper(sb, u), hash); + } + } + + qstr->hash = end_name_hash(hash); + return 0; +} + +static int exfat_utf8_d_cmp(const struct dentry *dentry, unsigned int len, + const char *str, const struct qstr *name) +{ + struct super_block *sb = dentry->d_sb; + unsigned int alen = exfat_striptail_len(name->len, name->name); + unsigned int blen = exfat_striptail_len(len, str); + unicode_t u_a, u_b; + int charlen, i; + + if (alen != blen) + return 1; + + for (i = 0; i < alen; i += charlen) { + charlen = utf8_to_utf32(&name->name[i], alen - i, &u_a); + if (charlen < 0) + return 1; + if (charlen != utf8_to_utf32(&str[i], blen - i, &u_b)) + return 1; + + if (u_a <= 0xFFFF && u_b <= 0xFFFF) { + if (exfat_toupper(sb, u_a) != exfat_toupper(sb, u_b)) + return 1; + } else if (u_a > 0xFFFF && u_b > 0xFFFF) { + if (exfat_low_surrogate(u_a) != + exfat_low_surrogate(u_b) || + exfat_high_surrogate(u_a) != + exfat_high_surrogate(u_b)) + return 1; + } else { + return 1; + } + } + + return 0; +} + +const struct dentry_operations exfat_utf8_dentry_ops = { + .d_revalidate = exfat_d_revalidate, + .d_hash = exfat_utf8_d_hash, + .d_compare = exfat_utf8_d_cmp, +}; + +/* used only in search empty_slot() */ +#define CNT_UNUSED_NOHIT (-1) +#define CNT_UNUSED_HIT (-2) +/* search EMPTY CONTINUOUS "num_entries" entries */ +static int exfat_search_empty_slot(struct super_block *sb, + struct exfat_hint_femp *hint_femp, struct exfat_chain *p_dir, + int num_entries) +{ + int i, dentry, num_empty = 0; + int dentries_per_clu; + unsigned int type; + struct exfat_chain clu; + struct exfat_dentry *ep; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; + + dentries_per_clu = sbi->dentries_per_clu; + + if (hint_femp->eidx != EXFAT_HINT_NONE) { + dentry = hint_femp->eidx; + if (num_entries <= hint_femp->count) { + hint_femp->eidx = EXFAT_HINT_NONE; + return dentry; + } + + exfat_chain_dup(&clu, &hint_femp->cur); + } else { + exfat_chain_dup(&clu, p_dir); + dentry = 0; + } + + while (clu.dir != EXFAT_EOF_CLUSTER) { + i = dentry & (dentries_per_clu - 1); + + for (; i < dentries_per_clu; i++, dentry++) { + ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + if (!ep) + return -EIO; + type = exfat_get_entry_type(ep); + brelse(bh); + + if (type == TYPE_UNUSED || type == TYPE_DELETED) { + num_empty++; + if (hint_femp->eidx == EXFAT_HINT_NONE) { + hint_femp->eidx = dentry; + hint_femp->count = CNT_UNUSED_NOHIT; + exfat_chain_set(&hint_femp->cur, + clu.dir, clu.size, clu.flags); + } + + if (type == TYPE_UNUSED && + hint_femp->count != CNT_UNUSED_HIT) + hint_femp->count = CNT_UNUSED_HIT; + } else { + if (hint_femp->eidx != EXFAT_HINT_NONE && + hint_femp->count == CNT_UNUSED_HIT) { + /* unused empty group means + * an empty group which includes + * unused dentry + */ + exfat_fs_error(sb, + "found bogus dentry(%d) beyond unused empty group(%d) (start_clu : %u, cur_clu : %u)", + dentry, hint_femp->eidx, + p_dir->dir, clu.dir); + return -EIO; + } + + num_empty = 0; + hint_femp->eidx = EXFAT_HINT_NONE; + } + + if (num_empty >= num_entries) { + /* found and invalidate hint_femp */ + hint_femp->eidx = EXFAT_HINT_NONE; + return (dentry - (num_entries - 1)); + } + } + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + if (--clu.size > 0) + clu.dir++; + else + clu.dir = EXFAT_EOF_CLUSTER; + } else { + if (exfat_get_next_cluster(sb, &clu.dir)) + return -EIO; + } + } + + return -ENOSPC; +} + +static int exfat_check_max_dentries(struct inode *inode) +{ + if (EXFAT_B_TO_DEN(i_size_read(inode)) >= MAX_EXFAT_DENTRIES) { + /* + * exFAT spec allows a dir to grow upto 8388608(256MB) + * dentries + */ + return -ENOSPC; + } + return 0; +} + +/* find empty directory entry. + * if there isn't any empty slot, expand cluster chain. + */ +static int exfat_find_empty_entry(struct inode *inode, + struct exfat_chain *p_dir, int num_entries) +{ + int dentry; + unsigned int ret, last_clu; + sector_t sector; + loff_t size = 0; + struct exfat_chain clu; + struct exfat_dentry *ep = NULL; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_hint_femp hint_femp; + + hint_femp.eidx = EXFAT_HINT_NONE; + + if (ei->hint_femp.eidx != EXFAT_HINT_NONE) { + memcpy(&hint_femp, &ei->hint_femp, + sizeof(struct exfat_hint_femp)); + ei->hint_femp.eidx = EXFAT_HINT_NONE; + } + + while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir, + num_entries)) < 0) { + if (dentry == -EIO) + break; + + if (exfat_check_max_dentries(inode)) + return -ENOSPC; + + /* we trust p_dir->size regardless of FAT type */ + if (exfat_find_last_cluster(sb, p_dir, &last_clu)) + return -EIO; + + /* + * Allocate new cluster to this directory + */ + exfat_chain_set(&clu, last_clu + 1, 0, p_dir->flags); + + /* allocate a cluster */ + ret = exfat_alloc_cluster(inode, 1, &clu); + if (ret) + return ret; + + if (exfat_zeroed_cluster(inode, clu.dir)) + return -EIO; + + /* append to the FAT chain */ + if (clu.flags != p_dir->flags) { + /* no-fat-chain bit is disabled, + * so fat-chain should be synced with alloc-bitmap + */ + exfat_chain_cont_cluster(sb, p_dir->dir, p_dir->size); + p_dir->flags = ALLOC_FAT_CHAIN; + hint_femp.cur.flags = ALLOC_FAT_CHAIN; + } + + if (clu.flags == ALLOC_FAT_CHAIN) + if (exfat_ent_set(sb, last_clu, clu.dir)) + return -EIO; + + if (hint_femp.eidx == EXFAT_HINT_NONE) { + /* the special case that new dentry + * should be allocated from the start of new cluster + */ + hint_femp.eidx = EXFAT_B_TO_DEN_IDX(p_dir->size, sbi); + hint_femp.count = sbi->dentries_per_clu; + + exfat_chain_set(&hint_femp.cur, clu.dir, 0, clu.flags); + } + hint_femp.cur.size++; + p_dir->size++; + size = EXFAT_CLU_TO_B(p_dir->size, sbi); + + /* update the directory entry */ + if (p_dir->dir != sbi->root_dir) { + struct buffer_head *bh; + + ep = exfat_get_dentry(sb, + &(ei->dir), ei->entry + 1, &bh, §or); + if (!ep) + return -EIO; + + ep->dentry.stream.valid_size = cpu_to_le64(size); + ep->dentry.stream.size = ep->dentry.stream.valid_size; + ep->dentry.stream.flags = p_dir->flags; + exfat_update_bh(sb, bh, IS_DIRSYNC(inode)); + brelse(bh); + if (exfat_update_dir_chksum(inode, &(ei->dir), + ei->entry)) + return -EIO; + } + + /* directory inode should be updated in here */ + i_size_write(inode, size); + EXFAT_I(inode)->i_size_ondisk += sbi->cluster_size; + EXFAT_I(inode)->i_size_aligned += sbi->cluster_size; + EXFAT_I(inode)->flags = p_dir->flags; + inode->i_blocks += 1 << sbi->sect_per_clus_bits; + } + + return dentry; +} + +/* + * Name Resolution Functions : + * Zero if it was successful; otherwise nonzero. + */ +static int __exfat_resolve_path(struct inode *inode, const unsigned char *path, + struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname, + int lookup) +{ + int namelen; + int lossy = NLS_NAME_NO_LOSSY; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + + /* strip all trailing periods */ + namelen = exfat_striptail_len(strlen(path), path); + if (!namelen) + return -ENOENT; + + if (strlen(path) > (MAX_NAME_LENGTH * MAX_CHARSET_SIZE)) + return -ENAMETOOLONG; + + /* + * strip all leading spaces : + * "MS windows 7" supports leading spaces. + * So we should skip this preprocessing for compatibility. + */ + + /* file name conversion : + * If lookup case, we allow bad-name for compatibility. + */ + namelen = exfat_nls_to_utf16(sb, path, namelen, p_uniname, + &lossy); + if (namelen < 0) + return namelen; /* return error value */ + + if ((lossy && !lookup) || !namelen) + return -EINVAL; + + exfat_chain_set(p_dir, ei->start_clu, + EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags); + + return 0; +} + +static inline int exfat_resolve_path(struct inode *inode, + const unsigned char *path, struct exfat_chain *dir, + struct exfat_uni_name *uni) +{ + return __exfat_resolve_path(inode, path, dir, uni, 0); +} + +static inline int exfat_resolve_path_for_lookup(struct inode *inode, + const unsigned char *path, struct exfat_chain *dir, + struct exfat_uni_name *uni) +{ + return __exfat_resolve_path(inode, path, dir, uni, 1); +} + +static inline loff_t exfat_make_i_pos(struct exfat_dir_entry *info) +{ + return ((loff_t) info->dir.dir << 32) | (info->entry & 0xffffffff); +} + +static int exfat_add_entry(struct inode *inode, const char *path, + struct exfat_chain *p_dir, unsigned int type, + struct exfat_dir_entry *info) +{ + int ret, dentry, num_entries; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_uni_name uniname; + struct exfat_chain clu; + int clu_size = 0; + unsigned int start_clu = EXFAT_FREE_CLUSTER; + + ret = exfat_resolve_path(inode, path, p_dir, &uniname); + if (ret) + goto out; + + num_entries = exfat_calc_num_entries(&uniname); + if (num_entries < 0) { + ret = num_entries; + goto out; + } + + /* exfat_find_empty_entry must be called before alloc_cluster() */ + dentry = exfat_find_empty_entry(inode, p_dir, num_entries); + if (dentry < 0) { + ret = dentry; /* -EIO or -ENOSPC */ + goto out; + } + + if (type == TYPE_DIR) { + ret = exfat_alloc_new_dir(inode, &clu); + if (ret) + goto out; + start_clu = clu.dir; + clu_size = sbi->cluster_size; + } + + /* update the directory entry */ + /* fill the dos name directory entry information of the created file. + * the first cluster is not determined yet. (0) + */ + ret = exfat_init_dir_entry(inode, p_dir, dentry, type, + start_clu, clu_size); + if (ret) + goto out; + + ret = exfat_init_ext_entry(inode, p_dir, dentry, num_entries, &uniname); + if (ret) + goto out; + + memcpy(&info->dir, p_dir, sizeof(struct exfat_chain)); + info->entry = dentry; + info->flags = ALLOC_NO_FAT_CHAIN; + info->type = type; + + if (type == TYPE_FILE) { + info->attr = ATTR_ARCHIVE; + info->start_clu = EXFAT_EOF_CLUSTER; + info->size = 0; + info->num_subdirs = 0; + } else { + int count; + struct exfat_chain cdir; + + info->attr = ATTR_SUBDIR; + info->start_clu = start_clu; + info->size = clu_size; + + exfat_chain_set(&cdir, info->start_clu, + EXFAT_B_TO_CLU(info->size, sbi), info->flags); + count = exfat_count_dir_entries(sb, &cdir); + if (count < 0) + return -EIO; + info->num_subdirs = count + EXFAT_MIN_SUBDIR; + } + memset(&info->crtime, 0, sizeof(info->crtime)); + memset(&info->mtime, 0, sizeof(info->mtime)); + memset(&info->atime, 0, sizeof(info->atime)); +out: + return ret; +} + +static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + struct exfat_chain cdir; + struct exfat_dir_entry info; + loff_t i_pos; + int err; + + mutex_lock(&EXFAT_SB(sb)->s_lock); + exfat_set_vol_flags(sb, VOL_DIRTY); + err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_FILE, + &info); + exfat_set_vol_flags(sb, VOL_CLEAN); + if (err) + goto unlock; + + inode_inc_iversion(dir); + dir->i_ctime = dir->i_mtime = current_time(dir); + if (IS_DIRSYNC(dir)) + exfat_sync_inode(dir); + else + mark_inode_dirty(dir); + + i_pos = exfat_make_i_pos(&info); + inode = exfat_build_inode(sb, &info, i_pos); + if (IS_ERR(inode)) + goto unlock; + + inode_inc_iversion(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = + EXFAT_I(inode)->i_crtime = current_time(inode); + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ + + d_instantiate(dentry, inode); +unlock: + mutex_unlock(&EXFAT_SB(sb)->s_lock); + return err; +} + +/* lookup a file */ +static int exfat_find(struct inode *dir, struct qstr *qname, + struct exfat_dir_entry *info) +{ + int ret, dentry, num_entries, count; + struct exfat_chain cdir; + struct exfat_uni_name uni_name; + struct exfat_dentry *ep, *ep2; + struct exfat_entry_set_cache *es = NULL; + struct super_block *sb = dir->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(dir); + + if (qname->len == 0) + return -ENOENT; + + /* check the validity of directory name in the given pathname */ + ret = exfat_resolve_path_for_lookup(dir, qname->name, &cdir, &uni_name); + if (ret) + return ret; + + num_entries = exfat_calc_num_entries(&uni_name); + if (num_entries < 0) + return num_entries; + + /* check the validation of hint_stat and initialize it if required */ + if (ei->version != (inode_peek_iversion_raw(dir) & 0xffffffff)) { + ei->hint_stat.clu = cdir.dir; + ei->hint_stat.eidx = 0; + ei->version = (inode_peek_iversion_raw(dir) & 0xffffffff); + ei->hint_femp.eidx = EXFAT_HINT_NONE; + } + + /* search the file name for directories */ + dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, + num_entries, TYPE_ALL); + + if ((dentry < 0) && (dentry != -EEXIST)) + return dentry; /* -error value */ + + memcpy(&info->dir, &cdir.dir, sizeof(struct exfat_chain)); + info->entry = dentry; + info->num_subdirs = 0; + + /* root directory itself */ + if (unlikely(dentry == -EEXIST)) { + int num_clu = 0; + + info->type = TYPE_DIR; + info->attr = ATTR_SUBDIR; + info->flags = ALLOC_FAT_CHAIN; + info->start_clu = sbi->root_dir; + memset(&info->crtime, 0, sizeof(info->crtime)); + memset(&info->mtime, 0, sizeof(info->mtime)); + memset(&info->atime, 0, sizeof(info->atime)); + + exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + if (exfat_count_num_clusters(sb, &cdir, &num_clu)) + return -EIO; + info->size = num_clu << sbi->cluster_size_bits; + + count = exfat_count_dir_entries(sb, &cdir); + if (count < 0) + return -EIO; + + info->num_subdirs = count; + } else { + es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES, &ep); + if (!es) + return -EIO; + ep2 = ep + 1; + + info->type = exfat_get_entry_type(ep); + info->attr = le16_to_cpu(ep->dentry.file.attr); + info->size = le64_to_cpu(ep2->dentry.stream.valid_size); + if ((info->type == TYPE_FILE) && (info->size == 0)) { + info->flags = ALLOC_NO_FAT_CHAIN; + info->start_clu = EXFAT_EOF_CLUSTER; + } else { + info->flags = ep2->dentry.stream.flags; + info->start_clu = + le32_to_cpu(ep2->dentry.stream.start_clu); + } + + if (ei->start_clu == EXFAT_FREE_CLUSTER) { + exfat_fs_error(sb, + "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)", + i_size_read(dir), ei->dir.dir, ei->entry); + return -EIO; + } + + exfat_get_entry_time(sbi, &info->crtime, + ep->dentry.file.create_tz, + ep->dentry.file.create_time, + ep->dentry.file.create_date, + ep->dentry.file.create_time_ms); + exfat_get_entry_time(sbi, &info->mtime, + ep->dentry.file.modify_tz, + ep->dentry.file.modify_time, + ep->dentry.file.modify_date, + ep->dentry.file.modify_time_ms); + exfat_get_entry_time(sbi, &info->atime, + ep->dentry.file.access_tz, + ep->dentry.file.access_time, + ep->dentry.file.access_date, + 0); + kfree(es); + + if (info->type == TYPE_DIR) { + exfat_chain_set(&cdir, info->start_clu, + EXFAT_B_TO_CLU(info->size, sbi), info->flags); + count = exfat_count_dir_entries(sb, &cdir); + if (count < 0) + return -EIO; + + info->num_subdirs = count + EXFAT_MIN_SUBDIR; + } + } + return 0; +} + +static int exfat_d_anon_disconn(struct dentry *dentry) +{ + return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED); +} + +static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + struct dentry *alias; + struct exfat_dir_entry info; + int err; + loff_t i_pos; + mode_t i_mode; + + mutex_lock(&EXFAT_SB(sb)->s_lock); + err = exfat_find(dir, &dentry->d_name, &info); + if (err) { + if (err == -ENOENT) { + inode = NULL; + goto out; + } + goto unlock; + } + + i_pos = exfat_make_i_pos(&info); + inode = exfat_build_inode(sb, &info, i_pos); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto unlock; + } + + i_mode = inode->i_mode; + alias = d_find_alias(inode); + + /* + * Checking "alias->d_parent == dentry->d_parent" to make sure + * FS is not corrupted (especially double linked dir). + */ + if (alias && alias->d_parent == dentry->d_parent && + !exfat_d_anon_disconn(alias)) { + + /* + * Unhashed alias is able to exist because of revalidate() + * called by lookup_fast. You can easily make this status + * by calling create and lookup concurrently + * In such case, we reuse an alias instead of new dentry + */ + if (d_unhashed(alias)) { + WARN_ON(alias->d_name.hash_len != + dentry->d_name.hash_len); + exfat_msg(sb, KERN_INFO, + "rehashed a dentry(%p) in read lookup", alias); + d_drop(dentry); + d_rehash(alias); + } else if (!S_ISDIR(i_mode)) { + /* + * This inode has non anonymous-DCACHE_DISCONNECTED + * dentry. This means, the user did ->lookup() by an + * another name (longname vs 8.3 alias of it) in past. + * + * Switch to new one for reason of locality if possible. + */ + d_move(alias, dentry); + } + iput(inode); + mutex_unlock(&EXFAT_SB(sb)->s_lock); + return alias; + } + dput(alias); +out: + mutex_unlock(&EXFAT_SB(sb)->s_lock); + if (!inode) + exfat_d_version_set(dentry, inode_query_iversion(dir)); + + return d_splice_alias(inode, dentry); +unlock: + mutex_unlock(&EXFAT_SB(sb)->s_lock); + return ERR_PTR(err); +} + +/* remove an entry, BUT don't truncate */ +static int exfat_unlink(struct inode *dir, struct dentry *dentry) +{ + struct exfat_chain cdir; + struct exfat_dentry *ep; + struct super_block *sb = dir->i_sb; + struct inode *inode = dentry->d_inode; + struct exfat_inode_info *ei = EXFAT_I(inode); + struct buffer_head *bh; + sector_t sector; + int num_entries, entry, err = 0; + + mutex_lock(&EXFAT_SB(sb)->s_lock); + exfat_chain_dup(&cdir, &ei->dir); + entry = ei->entry; + if (ei->dir.dir == DIR_DELETED) { + exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry"); + err = -ENOENT; + goto unlock; + } + + ep = exfat_get_dentry(sb, &cdir, entry, &bh, §or); + if (!ep) { + err = -EIO; + goto unlock; + } + num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep); + if (num_entries < 0) { + err = -EIO; + brelse(bh); + goto unlock; + } + num_entries++; + brelse(bh); + + exfat_set_vol_flags(sb, VOL_DIRTY); + /* update the directory entry */ + if (exfat_remove_entries(dir, &cdir, entry, 0, num_entries)) { + err = -EIO; + goto unlock; + } + + /* This doesn't modify ei */ + ei->dir.dir = DIR_DELETED; + exfat_set_vol_flags(sb, VOL_CLEAN); + + inode_inc_iversion(dir); + dir->i_mtime = dir->i_atime = current_time(dir); + if (IS_DIRSYNC(dir)) + exfat_sync_inode(dir); + else + mark_inode_dirty(dir); + + clear_nlink(inode); + inode->i_mtime = inode->i_atime = current_time(inode); + exfat_unhash_inode(inode); + exfat_d_version_set(dentry, inode_query_iversion(dir)); +unlock: + mutex_unlock(&EXFAT_SB(sb)->s_lock); + return err; +} + +static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct super_block *sb = dir->i_sb; + struct inode *inode; + struct exfat_dir_entry info; + struct exfat_chain cdir; + loff_t i_pos; + int err; + + mutex_lock(&EXFAT_SB(sb)->s_lock); + exfat_set_vol_flags(sb, VOL_DIRTY); + err = exfat_add_entry(dir, dentry->d_name.name, &cdir, TYPE_DIR, + &info); + exfat_set_vol_flags(sb, VOL_CLEAN); + if (err) + goto unlock; + + inode_inc_iversion(dir); + dir->i_ctime = dir->i_mtime = current_time(dir); + if (IS_DIRSYNC(dir)) + exfat_sync_inode(dir); + else + mark_inode_dirty(dir); + inc_nlink(dir); + + i_pos = exfat_make_i_pos(&info); + inode = exfat_build_inode(sb, &info, i_pos); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto unlock; + } + + inode_inc_iversion(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = + EXFAT_I(inode)->i_crtime = current_time(inode); + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ + + d_instantiate(dentry, inode); + +unlock: + mutex_unlock(&EXFAT_SB(sb)->s_lock); + return err; +} + +static int exfat_check_dir_empty(struct super_block *sb, + struct exfat_chain *p_dir) +{ + int i, dentries_per_clu; + unsigned int type; + struct exfat_chain clu; + struct exfat_dentry *ep; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; + + dentries_per_clu = sbi->dentries_per_clu; + + exfat_chain_dup(&clu, p_dir); + + while (clu.dir != EXFAT_EOF_CLUSTER) { + for (i = 0; i < dentries_per_clu; i++) { + ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + if (!ep) + return -EIO; + type = exfat_get_entry_type(ep); + brelse(bh); + if (type == TYPE_UNUSED) + return 0; + + if (type != TYPE_FILE && type != TYPE_DIR) + continue; + + return -ENOTEMPTY; + } + + if (clu.flags == ALLOC_NO_FAT_CHAIN) { + if (--clu.size > 0) + clu.dir++; + else + clu.dir = EXFAT_EOF_CLUSTER; + } else { + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + } + } + + return 0; +} + +static int exfat_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + struct exfat_dentry *ep; + struct exfat_chain cdir, clu_to_free; + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + struct buffer_head *bh; + sector_t sector; + int num_entries, entry, err; + + mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock); + + exfat_chain_dup(&cdir, &ei->dir); + entry = ei->entry; + + if (ei->dir.dir == DIR_DELETED) { + exfat_msg(sb, KERN_ERR, "abnormal access to deleted dentry"); + err = -ENOENT; + goto unlock; + } + + exfat_set_vol_flags(sb, VOL_DIRTY); + exfat_chain_set(&clu_to_free, ei->start_clu, + EXFAT_B_TO_CLU_ROUND_UP(i_size_read(inode), sbi), ei->flags); + + err = exfat_check_dir_empty(sb, &clu_to_free); + if (err) { + if (err == -EIO) + exfat_msg(sb, KERN_ERR, + "failed to exfat_check_dir_empty : err(%d)", + err); + goto unlock; + } + + ep = exfat_get_dentry(sb, &cdir, entry, &bh, §or); + if (!ep) { + err = -EIO; + goto unlock; + } + + num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep); + if (num_entries < 0) { + err = -EIO; + brelse(bh); + goto unlock; + } + num_entries++; + brelse(bh); + + err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries); + if (err) { + exfat_msg(sb, KERN_ERR, + "failed to exfat_remove_entries : err(%d)", + err); + goto unlock; + } + ei->dir.dir = DIR_DELETED; + exfat_set_vol_flags(sb, VOL_CLEAN); + + inode_inc_iversion(dir); + dir->i_mtime = dir->i_atime = current_time(dir); + if (IS_DIRSYNC(dir)) + exfat_sync_inode(dir); + else + mark_inode_dirty(dir); + drop_nlink(dir); + + clear_nlink(inode); + inode->i_mtime = inode->i_atime = current_time(inode); + exfat_unhash_inode(inode); + exfat_d_version_set(dentry, inode_query_iversion(dir)); +unlock: + mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock); + return err; +} + +static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir, + int oldentry, struct exfat_uni_name *p_uniname, + struct exfat_inode_info *ei) +{ + int ret, num_old_entries, num_new_entries; + sector_t sector_old, sector_new; + struct exfat_dentry *epold, *epnew; + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh, *old_bh; + int sync = IS_DIRSYNC(inode); + + epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh, §or_old); + if (!epold) + return -EIO; + + num_old_entries = exfat_count_ext_entries(sb, p_dir, oldentry, epold); + if (num_old_entries < 0) + return -EIO; + num_old_entries++; + + num_new_entries = exfat_calc_num_entries(p_uniname); + if (num_new_entries < 0) + return num_new_entries; + + if (num_old_entries < num_new_entries) { + int newentry; + + newentry = + exfat_find_empty_entry(inode, p_dir, num_new_entries); + if (newentry < 0) + return newentry; /* -EIO or -ENOSPC */ + + epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh, + §or_new); + if (!epnew) + return -EIO; + + memcpy(epnew, epold, DENTRY_SIZE); + if (exfat_get_entry_type(epnew) == TYPE_FILE) { + epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE); + ei->attr |= ATTR_ARCHIVE; + } + exfat_update_bh(sb, new_bh, sync); + brelse(old_bh); + brelse(new_bh); + + epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh, + §or_old); + epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh, + §or_new); + if (!epold || !epnew) + return -EIO; + + memcpy(epnew, epold, DENTRY_SIZE); + exfat_update_bh(sb, new_bh, sync); + brelse(old_bh); + brelse(new_bh); + + ret = exfat_init_ext_entry(inode, p_dir, newentry, + num_new_entries, p_uniname); + if (ret) + return ret; + + exfat_remove_entries(inode, p_dir, oldentry, 0, + num_old_entries); + ei->entry = newentry; + } else { + if (exfat_get_entry_type(epold) == TYPE_FILE) { + epold->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE); + ei->attr |= ATTR_ARCHIVE; + } + exfat_update_bh(sb, old_bh, sync); + brelse(old_bh); + ret = exfat_init_ext_entry(inode, p_dir, oldentry, + num_new_entries, p_uniname); + if (ret) + return ret; + + exfat_remove_entries(inode, p_dir, oldentry, num_new_entries, + num_old_entries); + } + return 0; +} + +static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir, + int oldentry, struct exfat_chain *p_newdir, + struct exfat_uni_name *p_uniname, struct exfat_inode_info *ei) +{ + int ret, newentry, num_new_entries, num_old_entries; + sector_t sector_mov, sector_new; + struct exfat_dentry *epmov, *epnew; + struct super_block *sb = inode->i_sb; + struct buffer_head *mov_bh, *new_bh; + + epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh, §or_mov); + if (!epmov) + return -EIO; + + /* check if the source and target directory is the same */ + if (exfat_get_entry_type(epmov) == TYPE_DIR && + le32_to_cpu(epmov->dentry.stream.start_clu) == p_newdir->dir) + return -EINVAL; + + num_old_entries = exfat_count_ext_entries(sb, p_olddir, oldentry, + epmov); + if (num_old_entries < 0) + return -EIO; + num_old_entries++; + + num_new_entries = exfat_calc_num_entries(p_uniname); + if (num_new_entries < 0) + return num_new_entries; + + newentry = exfat_find_empty_entry(inode, p_newdir, num_new_entries); + if (newentry < 0) + return newentry; /* -EIO or -ENOSPC */ + + epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh, §or_new); + if (!epnew) + return -EIO; + + memcpy(epnew, epmov, DENTRY_SIZE); + if (exfat_get_entry_type(epnew) == TYPE_FILE) { + epnew->dentry.file.attr |= cpu_to_le16(ATTR_ARCHIVE); + ei->attr |= ATTR_ARCHIVE; + } + exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode)); + brelse(mov_bh); + brelse(new_bh); + + epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh, + §or_mov); + epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh, + §or_new); + if (!epmov || !epnew) + return -EIO; + + memcpy(epnew, epmov, DENTRY_SIZE); + exfat_update_bh(sb, new_bh, IS_DIRSYNC(inode)); + brelse(mov_bh); + brelse(new_bh); + + ret = exfat_init_ext_entry(inode, p_newdir, newentry, num_new_entries, + p_uniname); + if (ret) + return ret; + + exfat_remove_entries(inode, p_olddir, oldentry, 0, num_old_entries); + + exfat_chain_set(&ei->dir, p_newdir->dir, p_newdir->size, + p_newdir->flags); + + ei->entry = newentry; + return 0; +} + +static void exfat_update_parent_info(struct exfat_inode_info *ei, + struct inode *parent_inode) +{ + struct exfat_sb_info *sbi = EXFAT_SB(parent_inode->i_sb); + struct exfat_inode_info *parent_ei = EXFAT_I(parent_inode); + loff_t parent_isize = i_size_read(parent_inode); + + /* + * the problem that struct exfat_inode_info caches wrong parent info. + * + * because of flag-mismatch of ei->dir, + * there is abnormal traversing cluster chain. + */ + if (unlikely(parent_ei->flags != ei->dir.flags || + parent_isize != EXFAT_CLU_TO_B(ei->dir.size, sbi) || + parent_ei->start_clu != ei->dir.dir)) { + exfat_chain_set(&ei->dir, parent_ei->start_clu, + EXFAT_B_TO_CLU_ROUND_UP(parent_isize, sbi), + parent_ei->flags); + } +} + +/* rename or move a old file into a new file */ +static int __exfat_rename(struct inode *old_parent_inode, + struct exfat_inode_info *ei, struct inode *new_parent_inode, + struct dentry *new_dentry) +{ + int ret; + int dentry; + struct exfat_chain olddir, newdir; + struct exfat_chain *p_dir = NULL; + struct exfat_uni_name uni_name; + struct exfat_dentry *ep; + struct super_block *sb = old_parent_inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + const unsigned char *new_path = new_dentry->d_name.name; + struct inode *new_inode = new_dentry->d_inode; + int num_entries; + struct exfat_inode_info *new_ei = NULL; + unsigned int new_entry_type = TYPE_UNUSED; + int new_entry = 0; + struct buffer_head *old_bh, *new_bh = NULL; + + /* check the validity of pointer parameters */ + if (new_path == NULL || strlen(new_path) == 0) + return -EINVAL; + + if (ei->dir.dir == DIR_DELETED) { + exfat_msg(sb, KERN_ERR, + "abnormal access to deleted source dentry"); + return -ENOENT; + } + + exfat_update_parent_info(ei, old_parent_inode); + + exfat_chain_dup(&olddir, &ei->dir); + dentry = ei->entry; + + ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh, NULL); + if (!ep) { + ret = -EIO; + goto out; + } + brelse(old_bh); + + /* check whether new dir is existing directory and empty */ + if (new_inode) { + ret = -EIO; + new_ei = EXFAT_I(new_inode); + + if (new_ei->dir.dir == DIR_DELETED) { + exfat_msg(sb, KERN_ERR, + "abnormal access to deleted target dentry"); + goto out; + } + + exfat_update_parent_info(new_ei, new_parent_inode); + + p_dir = &(new_ei->dir); + new_entry = new_ei->entry; + ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL); + if (!ep) + goto out; + + new_entry_type = exfat_get_entry_type(ep); + brelse(new_bh); + + /* if new_inode exists, update ei */ + if (new_entry_type == TYPE_DIR) { + struct exfat_chain new_clu; + + new_clu.dir = new_ei->start_clu; + new_clu.size = + EXFAT_B_TO_CLU_ROUND_UP(i_size_read(new_inode), + sbi); + new_clu.flags = new_ei->flags; + + ret = exfat_check_dir_empty(sb, &new_clu); + if (ret) + goto out; + } + } + + /* check the validity of directory name in the given new pathname */ + ret = exfat_resolve_path(new_parent_inode, new_path, &newdir, + &uni_name); + if (ret) + goto out; + + exfat_set_vol_flags(sb, VOL_DIRTY); + + if (olddir.dir == newdir.dir) + ret = exfat_rename_file(new_parent_inode, &olddir, dentry, + &uni_name, ei); + else + ret = exfat_move_file(new_parent_inode, &olddir, dentry, + &newdir, &uni_name, ei); + + if (!ret && new_inode) { + /* delete entries of new_dir */ + ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh, NULL); + if (!ep) { + ret = -EIO; + goto del_out; + } + + num_entries = exfat_count_ext_entries(sb, p_dir, new_entry, ep); + if (num_entries < 0) { + ret = -EIO; + goto del_out; + } + brelse(new_bh); + + if (exfat_remove_entries(new_inode, p_dir, new_entry, 0, + num_entries + 1)) { + ret = -EIO; + goto del_out; + } + + /* Free the clusters if new_inode is a dir(as if exfat_rmdir) */ + if (new_entry_type == TYPE_DIR) { + /* new_ei, new_clu_to_free */ + struct exfat_chain new_clu_to_free; + + exfat_chain_set(&new_clu_to_free, new_ei->start_clu, + EXFAT_B_TO_CLU_ROUND_UP(i_size_read(new_inode), + sbi), new_ei->flags); + + if (exfat_free_cluster(new_inode, &new_clu_to_free)) { + /* just set I/O error only */ + ret = -EIO; + } + + i_size_write(new_inode, 0); + new_ei->start_clu = EXFAT_EOF_CLUSTER; + new_ei->flags = ALLOC_NO_FAT_CHAIN; + } +del_out: + /* Update new_inode ei + * Prevent syncing removed new_inode + * (new_ei is already initialized above code ("if (new_inode)") + */ + new_ei->dir.dir = DIR_DELETED; + } + exfat_set_vol_flags(sb, VOL_CLEAN); +out: + return ret; +} + +static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct inode *old_inode, *new_inode; + struct super_block *sb = old_dir->i_sb; + loff_t i_pos; + int err; + + /* + * The VFS already checks for existence, so for local filesystems + * the RENAME_NOREPLACE implementation is equivalent to plain rename. + * Don't support any other flags + */ + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + mutex_lock(&EXFAT_SB(sb)->s_lock); + old_inode = old_dentry->d_inode; + new_inode = new_dentry->d_inode; + + err = __exfat_rename(old_dir, EXFAT_I(old_inode), new_dir, new_dentry); + if (err) + goto unlock; + + inode_inc_iversion(new_dir); + new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime = + EXFAT_I(new_dir)->i_crtime = current_time(new_dir); + if (IS_DIRSYNC(new_dir)) + exfat_sync_inode(new_dir); + else + mark_inode_dirty(new_dir); + + i_pos = ((loff_t)EXFAT_I(old_inode)->dir.dir << 32) | + (EXFAT_I(old_inode)->entry & 0xffffffff); + exfat_unhash_inode(old_inode); + exfat_hash_inode(old_inode, i_pos); + if (IS_DIRSYNC(new_dir)) + exfat_sync_inode(old_inode); + else + mark_inode_dirty(old_inode); + + if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) { + drop_nlink(old_dir); + if (!new_inode) + inc_nlink(new_dir); + } + + inode_inc_iversion(old_dir); + old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + if (IS_DIRSYNC(old_dir)) + exfat_sync_inode(old_dir); + else + mark_inode_dirty(old_dir); + + if (new_inode) { + exfat_unhash_inode(new_inode); + + /* skip drop_nlink if new_inode already has been dropped */ + if (new_inode->i_nlink) { + drop_nlink(new_inode); + if (S_ISDIR(new_inode->i_mode)) + drop_nlink(new_inode); + } else { + exfat_msg(sb, KERN_WARNING, + "abnormal access to an inode dropped"); + WARN_ON(new_inode->i_nlink == 0); + } + new_inode->i_ctime = EXFAT_I(new_inode)->i_crtime = + current_time(new_inode); + } + +unlock: + mutex_unlock(&EXFAT_SB(sb)->s_lock); + return err; +} + +const struct inode_operations exfat_dir_inode_operations = { + .create = exfat_create, + .lookup = exfat_lookup, + .unlink = exfat_unlink, + .mkdir = exfat_mkdir, + .rmdir = exfat_rmdir, + .rename = exfat_rename, + .setattr = exfat_setattr, + .getattr = exfat_getattr, +}; diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c new file mode 100644 index 000000000000..6d1c3ae130ff --- /dev/null +++ b/fs/exfat/nls.c @@ -0,0 +1,831 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/buffer_head.h> +#include <asm/unaligned.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +/* Upcase tabel macro */ +#define EXFAT_NUM_UPCASE (2918) +#define UTBL_COUNT (0x10000) + +/* + * Upcase table in compressed format (7.2.5.1 Recommended Up-case Table + * in exfat specification, See: + * https://docs.microsoft.com/en-us/windows/win32/fileio/exfat-specification). + */ +static const unsigned short uni_def_upcase[EXFAT_NUM_UPCASE] = { + 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, + 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, + 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, + 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, + 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, + 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, + 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, + 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, + 0x0060, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, + 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, + 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, + 0x0058, 0x0059, 0x005a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, + 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, + 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, + 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, + 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, + 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, + 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, + 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, + 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, + 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, + 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, + 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, + 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, + 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00f7, + 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178, + 0x0100, 0x0100, 0x0102, 0x0102, 0x0104, 0x0104, 0x0106, 0x0106, + 0x0108, 0x0108, 0x010a, 0x010a, 0x010c, 0x010c, 0x010e, 0x010e, + 0x0110, 0x0110, 0x0112, 0x0112, 0x0114, 0x0114, 0x0116, 0x0116, + 0x0118, 0x0118, 0x011a, 0x011a, 0x011c, 0x011c, 0x011e, 0x011e, + 0x0120, 0x0120, 0x0122, 0x0122, 0x0124, 0x0124, 0x0126, 0x0126, + 0x0128, 0x0128, 0x012a, 0x012a, 0x012c, 0x012c, 0x012e, 0x012e, + 0x0130, 0x0131, 0x0132, 0x0132, 0x0134, 0x0134, 0x0136, 0x0136, + 0x0138, 0x0139, 0x0139, 0x013b, 0x013b, 0x013d, 0x013d, 0x013f, + 0x013f, 0x0141, 0x0141, 0x0143, 0x0143, 0x0145, 0x0145, 0x0147, + 0x0147, 0x0149, 0x014a, 0x014a, 0x014c, 0x014c, 0x014e, 0x014e, + 0x0150, 0x0150, 0x0152, 0x0152, 0x0154, 0x0154, 0x0156, 0x0156, + 0x0158, 0x0158, 0x015a, 0x015a, 0x015c, 0x015c, 0x015e, 0x015e, + 0x0160, 0x0160, 0x0162, 0x0162, 0x0164, 0x0164, 0x0166, 0x0166, + 0x0168, 0x0168, 0x016a, 0x016a, 0x016c, 0x016c, 0x016e, 0x016e, + 0x0170, 0x0170, 0x0172, 0x0172, 0x0174, 0x0174, 0x0176, 0x0176, + 0x0178, 0x0179, 0x0179, 0x017b, 0x017b, 0x017d, 0x017d, 0x017f, + 0x0243, 0x0181, 0x0182, 0x0182, 0x0184, 0x0184, 0x0186, 0x0187, + 0x0187, 0x0189, 0x018a, 0x018b, 0x018b, 0x018d, 0x018e, 0x018f, + 0x0190, 0x0191, 0x0191, 0x0193, 0x0194, 0x01f6, 0x0196, 0x0197, + 0x0198, 0x0198, 0x023d, 0x019b, 0x019c, 0x019d, 0x0220, 0x019f, + 0x01a0, 0x01a0, 0x01a2, 0x01a2, 0x01a4, 0x01a4, 0x01a6, 0x01a7, + 0x01a7, 0x01a9, 0x01aa, 0x01ab, 0x01ac, 0x01ac, 0x01ae, 0x01af, + 0x01af, 0x01b1, 0x01b2, 0x01b3, 0x01b3, 0x01b5, 0x01b5, 0x01b7, + 0x01b8, 0x01b8, 0x01ba, 0x01bb, 0x01bc, 0x01bc, 0x01be, 0x01f7, + 0x01c0, 0x01c1, 0x01c2, 0x01c3, 0x01c4, 0x01c5, 0x01c4, 0x01c7, + 0x01c8, 0x01c7, 0x01ca, 0x01cb, 0x01ca, 0x01cd, 0x01cd, 0x01cf, + 0x01cf, 0x01d1, 0x01d1, 0x01d3, 0x01d3, 0x01d5, 0x01d5, 0x01d7, + 0x01d7, 0x01d9, 0x01d9, 0x01db, 0x01db, 0x018e, 0x01de, 0x01de, + 0x01e0, 0x01e0, 0x01e2, 0x01e2, 0x01e4, 0x01e4, 0x01e6, 0x01e6, + 0x01e8, 0x01e8, 0x01ea, 0x01ea, 0x01ec, 0x01ec, 0x01ee, 0x01ee, + 0x01f0, 0x01f1, 0x01f2, 0x01f1, 0x01f4, 0x01f4, 0x01f6, 0x01f7, + 0x01f8, 0x01f8, 0x01fa, 0x01fa, 0x01fc, 0x01fc, 0x01fe, 0x01fe, + 0x0200, 0x0200, 0x0202, 0x0202, 0x0204, 0x0204, 0x0206, 0x0206, + 0x0208, 0x0208, 0x020a, 0x020a, 0x020c, 0x020c, 0x020e, 0x020e, + 0x0210, 0x0210, 0x0212, 0x0212, 0x0214, 0x0214, 0x0216, 0x0216, + 0x0218, 0x0218, 0x021a, 0x021a, 0x021c, 0x021c, 0x021e, 0x021e, + 0x0220, 0x0221, 0x0222, 0x0222, 0x0224, 0x0224, 0x0226, 0x0226, + 0x0228, 0x0228, 0x022a, 0x022a, 0x022c, 0x022c, 0x022e, 0x022e, + 0x0230, 0x0230, 0x0232, 0x0232, 0x0234, 0x0235, 0x0236, 0x0237, + 0x0238, 0x0239, 0x2c65, 0x023b, 0x023b, 0x023d, 0x2c66, 0x023f, + 0x0240, 0x0241, 0x0241, 0x0243, 0x0244, 0x0245, 0x0246, 0x0246, + 0x0248, 0x0248, 0x024a, 0x024a, 0x024c, 0x024c, 0x024e, 0x024e, + 0x0250, 0x0251, 0x0252, 0x0181, 0x0186, 0x0255, 0x0189, 0x018a, + 0x0258, 0x018f, 0x025a, 0x0190, 0x025c, 0x025d, 0x025e, 0x025f, + 0x0193, 0x0261, 0x0262, 0x0194, 0x0264, 0x0265, 0x0266, 0x0267, + 0x0197, 0x0196, 0x026a, 0x2c62, 0x026c, 0x026d, 0x026e, 0x019c, + 0x0270, 0x0271, 0x019d, 0x0273, 0x0274, 0x019f, 0x0276, 0x0277, + 0x0278, 0x0279, 0x027a, 0x027b, 0x027c, 0x2c64, 0x027e, 0x027f, + 0x01a6, 0x0281, 0x0282, 0x01a9, 0x0284, 0x0285, 0x0286, 0x0287, + 0x01ae, 0x0244, 0x01b1, 0x01b2, 0x0245, 0x028d, 0x028e, 0x028f, + 0x0290, 0x0291, 0x01b7, 0x0293, 0x0294, 0x0295, 0x0296, 0x0297, + 0x0298, 0x0299, 0x029a, 0x029b, 0x029c, 0x029d, 0x029e, 0x029f, + 0x02a0, 0x02a1, 0x02a2, 0x02a3, 0x02a4, 0x02a5, 0x02a6, 0x02a7, + 0x02a8, 0x02a9, 0x02aa, 0x02ab, 0x02ac, 0x02ad, 0x02ae, 0x02af, + 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x02b5, 0x02b6, 0x02b7, + 0x02b8, 0x02b9, 0x02ba, 0x02bb, 0x02bc, 0x02bd, 0x02be, 0x02bf, + 0x02c0, 0x02c1, 0x02c2, 0x02c3, 0x02c4, 0x02c5, 0x02c6, 0x02c7, + 0x02c8, 0x02c9, 0x02ca, 0x02cb, 0x02cc, 0x02cd, 0x02ce, 0x02cf, + 0x02d0, 0x02d1, 0x02d2, 0x02d3, 0x02d4, 0x02d5, 0x02d6, 0x02d7, + 0x02d8, 0x02d9, 0x02da, 0x02db, 0x02dc, 0x02dd, 0x02de, 0x02df, + 0x02e0, 0x02e1, 0x02e2, 0x02e3, 0x02e4, 0x02e5, 0x02e6, 0x02e7, + 0x02e8, 0x02e9, 0x02ea, 0x02eb, 0x02ec, 0x02ed, 0x02ee, 0x02ef, + 0x02f0, 0x02f1, 0x02f2, 0x02f3, 0x02f4, 0x02f5, 0x02f6, 0x02f7, + 0x02f8, 0x02f9, 0x02fa, 0x02fb, 0x02fc, 0x02fd, 0x02fe, 0x02ff, + 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 0x0307, + 0x0308, 0x0309, 0x030a, 0x030b, 0x030c, 0x030d, 0x030e, 0x030f, + 0x0310, 0x0311, 0x0312, 0x0313, 0x0314, 0x0315, 0x0316, 0x0317, + 0x0318, 0x0319, 0x031a, 0x031b, 0x031c, 0x031d, 0x031e, 0x031f, + 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 0x0327, + 0x0328, 0x0329, 0x032a, 0x032b, 0x032c, 0x032d, 0x032e, 0x032f, + 0x0330, 0x0331, 0x0332, 0x0333, 0x0334, 0x0335, 0x0336, 0x0337, + 0x0338, 0x0339, 0x033a, 0x033b, 0x033c, 0x033d, 0x033e, 0x033f, + 0x0340, 0x0341, 0x0342, 0x0343, 0x0344, 0x0345, 0x0346, 0x0347, + 0x0348, 0x0349, 0x034a, 0x034b, 0x034c, 0x034d, 0x034e, 0x034f, + 0x0350, 0x0351, 0x0352, 0x0353, 0x0354, 0x0355, 0x0356, 0x0357, + 0x0358, 0x0359, 0x035a, 0x035b, 0x035c, 0x035d, 0x035e, 0x035f, + 0x0360, 0x0361, 0x0362, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, + 0x0368, 0x0369, 0x036a, 0x036b, 0x036c, 0x036d, 0x036e, 0x036f, + 0x0370, 0x0371, 0x0372, 0x0373, 0x0374, 0x0375, 0x0376, 0x0377, + 0x0378, 0x0379, 0x037a, 0x03fd, 0x03fe, 0x03ff, 0x037e, 0x037f, + 0x0380, 0x0381, 0x0382, 0x0383, 0x0384, 0x0385, 0x0386, 0x0387, + 0x0388, 0x0389, 0x038a, 0x038b, 0x038c, 0x038d, 0x038e, 0x038f, + 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, + 0x03a0, 0x03a1, 0x03a2, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x0386, 0x0388, 0x0389, 0x038a, + 0x03b0, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, + 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, + 0x03a0, 0x03a1, 0x03a3, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, + 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x038c, 0x038e, 0x038f, 0x03cf, + 0x03d0, 0x03d1, 0x03d2, 0x03d3, 0x03d4, 0x03d5, 0x03d6, 0x03d7, + 0x03d8, 0x03d8, 0x03da, 0x03da, 0x03dc, 0x03dc, 0x03de, 0x03de, + 0x03e0, 0x03e0, 0x03e2, 0x03e2, 0x03e4, 0x03e4, 0x03e6, 0x03e6, + 0x03e8, 0x03e8, 0x03ea, 0x03ea, 0x03ec, 0x03ec, 0x03ee, 0x03ee, + 0x03f0, 0x03f1, 0x03f9, 0x03f3, 0x03f4, 0x03f5, 0x03f6, 0x03f7, + 0x03f7, 0x03f9, 0x03fa, 0x03fa, 0x03fc, 0x03fd, 0x03fe, 0x03ff, + 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, + 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, + 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, + 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, + 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, + 0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, + 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, + 0x0460, 0x0460, 0x0462, 0x0462, 0x0464, 0x0464, 0x0466, 0x0466, + 0x0468, 0x0468, 0x046a, 0x046a, 0x046c, 0x046c, 0x046e, 0x046e, + 0x0470, 0x0470, 0x0472, 0x0472, 0x0474, 0x0474, 0x0476, 0x0476, + 0x0478, 0x0478, 0x047a, 0x047a, 0x047c, 0x047c, 0x047e, 0x047e, + 0x0480, 0x0480, 0x0482, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, + 0x0488, 0x0489, 0x048a, 0x048a, 0x048c, 0x048c, 0x048e, 0x048e, + 0x0490, 0x0490, 0x0492, 0x0492, 0x0494, 0x0494, 0x0496, 0x0496, + 0x0498, 0x0498, 0x049a, 0x049a, 0x049c, 0x049c, 0x049e, 0x049e, + 0x04a0, 0x04a0, 0x04a2, 0x04a2, 0x04a4, 0x04a4, 0x04a6, 0x04a6, + 0x04a8, 0x04a8, 0x04aa, 0x04aa, 0x04ac, 0x04ac, 0x04ae, 0x04ae, + 0x04b0, 0x04b0, 0x04b2, 0x04b2, 0x04b4, 0x04b4, 0x04b6, 0x04b6, + 0x04b8, 0x04b8, 0x04ba, 0x04ba, 0x04bc, 0x04bc, 0x04be, 0x04be, + 0x04c0, 0x04c1, 0x04c1, 0x04c3, 0x04c3, 0x04c5, 0x04c5, 0x04c7, + 0x04c7, 0x04c9, 0x04c9, 0x04cb, 0x04cb, 0x04cd, 0x04cd, 0x04c0, + 0x04d0, 0x04d0, 0x04d2, 0x04d2, 0x04d4, 0x04d4, 0x04d6, 0x04d6, + 0x04d8, 0x04d8, 0x04da, 0x04da, 0x04dc, 0x04dc, 0x04de, 0x04de, + 0x04e0, 0x04e0, 0x04e2, 0x04e2, 0x04e4, 0x04e4, 0x04e6, 0x04e6, + 0x04e8, 0x04e8, 0x04ea, 0x04ea, 0x04ec, 0x04ec, 0x04ee, 0x04ee, + 0x04f0, 0x04f0, 0x04f2, 0x04f2, 0x04f4, 0x04f4, 0x04f6, 0x04f6, + 0x04f8, 0x04f8, 0x04fa, 0x04fa, 0x04fc, 0x04fc, 0x04fe, 0x04fe, + 0x0500, 0x0500, 0x0502, 0x0502, 0x0504, 0x0504, 0x0506, 0x0506, + 0x0508, 0x0508, 0x050a, 0x050a, 0x050c, 0x050c, 0x050e, 0x050e, + 0x0510, 0x0510, 0x0512, 0x0512, 0x0514, 0x0515, 0x0516, 0x0517, + 0x0518, 0x0519, 0x051a, 0x051b, 0x051c, 0x051d, 0x051e, 0x051f, + 0x0520, 0x0521, 0x0522, 0x0523, 0x0524, 0x0525, 0x0526, 0x0527, + 0x0528, 0x0529, 0x052a, 0x052b, 0x052c, 0x052d, 0x052e, 0x052f, + 0x0530, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, + 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f, + 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, + 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f, + 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0557, + 0x0558, 0x0559, 0x055a, 0x055b, 0x055c, 0x055d, 0x055e, 0x055f, + 0x0560, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537, + 0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f, + 0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547, + 0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f, + 0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0xffff, + 0x17f6, 0x2c63, 0x1d7e, 0x1d7f, 0x1d80, 0x1d81, 0x1d82, 0x1d83, + 0x1d84, 0x1d85, 0x1d86, 0x1d87, 0x1d88, 0x1d89, 0x1d8a, 0x1d8b, + 0x1d8c, 0x1d8d, 0x1d8e, 0x1d8f, 0x1d90, 0x1d91, 0x1d92, 0x1d93, + 0x1d94, 0x1d95, 0x1d96, 0x1d97, 0x1d98, 0x1d99, 0x1d9a, 0x1d9b, + 0x1d9c, 0x1d9d, 0x1d9e, 0x1d9f, 0x1da0, 0x1da1, 0x1da2, 0x1da3, + 0x1da4, 0x1da5, 0x1da6, 0x1da7, 0x1da8, 0x1da9, 0x1daa, 0x1dab, + 0x1dac, 0x1dad, 0x1dae, 0x1daf, 0x1db0, 0x1db1, 0x1db2, 0x1db3, + 0x1db4, 0x1db5, 0x1db6, 0x1db7, 0x1db8, 0x1db9, 0x1dba, 0x1dbb, + 0x1dbc, 0x1dbd, 0x1dbe, 0x1dbf, 0x1dc0, 0x1dc1, 0x1dc2, 0x1dc3, + 0x1dc4, 0x1dc5, 0x1dc6, 0x1dc7, 0x1dc8, 0x1dc9, 0x1dca, 0x1dcb, + 0x1dcc, 0x1dcd, 0x1dce, 0x1dcf, 0x1dd0, 0x1dd1, 0x1dd2, 0x1dd3, + 0x1dd4, 0x1dd5, 0x1dd6, 0x1dd7, 0x1dd8, 0x1dd9, 0x1dda, 0x1ddb, + 0x1ddc, 0x1ddd, 0x1dde, 0x1ddf, 0x1de0, 0x1de1, 0x1de2, 0x1de3, + 0x1de4, 0x1de5, 0x1de6, 0x1de7, 0x1de8, 0x1de9, 0x1dea, 0x1deb, + 0x1dec, 0x1ded, 0x1dee, 0x1def, 0x1df0, 0x1df1, 0x1df2, 0x1df3, + 0x1df4, 0x1df5, 0x1df6, 0x1df7, 0x1df8, 0x1df9, 0x1dfa, 0x1dfb, + 0x1dfc, 0x1dfd, 0x1dfe, 0x1dff, 0x1e00, 0x1e00, 0x1e02, 0x1e02, + 0x1e04, 0x1e04, 0x1e06, 0x1e06, 0x1e08, 0x1e08, 0x1e0a, 0x1e0a, + 0x1e0c, 0x1e0c, 0x1e0e, 0x1e0e, 0x1e10, 0x1e10, 0x1e12, 0x1e12, + 0x1e14, 0x1e14, 0x1e16, 0x1e16, 0x1e18, 0x1e18, 0x1e1a, 0x1e1a, + 0x1e1c, 0x1e1c, 0x1e1e, 0x1e1e, 0x1e20, 0x1e20, 0x1e22, 0x1e22, + 0x1e24, 0x1e24, 0x1e26, 0x1e26, 0x1e28, 0x1e28, 0x1e2a, 0x1e2a, + 0x1e2c, 0x1e2c, 0x1e2e, 0x1e2e, 0x1e30, 0x1e30, 0x1e32, 0x1e32, + 0x1e34, 0x1e34, 0x1e36, 0x1e36, 0x1e38, 0x1e38, 0x1e3a, 0x1e3a, + 0x1e3c, 0x1e3c, 0x1e3e, 0x1e3e, 0x1e40, 0x1e40, 0x1e42, 0x1e42, + 0x1e44, 0x1e44, 0x1e46, 0x1e46, 0x1e48, 0x1e48, 0x1e4a, 0x1e4a, + 0x1e4c, 0x1e4c, 0x1e4e, 0x1e4e, 0x1e50, 0x1e50, 0x1e52, 0x1e52, + 0x1e54, 0x1e54, 0x1e56, 0x1e56, 0x1e58, 0x1e58, 0x1e5a, 0x1e5a, + 0x1e5c, 0x1e5c, 0x1e5e, 0x1e5e, 0x1e60, 0x1e60, 0x1e62, 0x1e62, + 0x1e64, 0x1e64, 0x1e66, 0x1e66, 0x1e68, 0x1e68, 0x1e6a, 0x1e6a, + 0x1e6c, 0x1e6c, 0x1e6e, 0x1e6e, 0x1e70, 0x1e70, 0x1e72, 0x1e72, + 0x1e74, 0x1e74, 0x1e76, 0x1e76, 0x1e78, 0x1e78, 0x1e7a, 0x1e7a, + 0x1e7c, 0x1e7c, 0x1e7e, 0x1e7e, 0x1e80, 0x1e80, 0x1e82, 0x1e82, + 0x1e84, 0x1e84, 0x1e86, 0x1e86, 0x1e88, 0x1e88, 0x1e8a, 0x1e8a, + 0x1e8c, 0x1e8c, 0x1e8e, 0x1e8e, 0x1e90, 0x1e90, 0x1e92, 0x1e92, + 0x1e94, 0x1e94, 0x1e96, 0x1e97, 0x1e98, 0x1e99, 0x1e9a, 0x1e9b, + 0x1e9c, 0x1e9d, 0x1e9e, 0x1e9f, 0x1ea0, 0x1ea0, 0x1ea2, 0x1ea2, + 0x1ea4, 0x1ea4, 0x1ea6, 0x1ea6, 0x1ea8, 0x1ea8, 0x1eaa, 0x1eaa, + 0x1eac, 0x1eac, 0x1eae, 0x1eae, 0x1eb0, 0x1eb0, 0x1eb2, 0x1eb2, + 0x1eb4, 0x1eb4, 0x1eb6, 0x1eb6, 0x1eb8, 0x1eb8, 0x1eba, 0x1eba, + 0x1ebc, 0x1ebc, 0x1ebe, 0x1ebe, 0x1ec0, 0x1ec0, 0x1ec2, 0x1ec2, + 0x1ec4, 0x1ec4, 0x1ec6, 0x1ec6, 0x1ec8, 0x1ec8, 0x1eca, 0x1eca, + 0x1ecc, 0x1ecc, 0x1ece, 0x1ece, 0x1ed0, 0x1ed0, 0x1ed2, 0x1ed2, + 0x1ed4, 0x1ed4, 0x1ed6, 0x1ed6, 0x1ed8, 0x1ed8, 0x1eda, 0x1eda, + 0x1edc, 0x1edc, 0x1ede, 0x1ede, 0x1ee0, 0x1ee0, 0x1ee2, 0x1ee2, + 0x1ee4, 0x1ee4, 0x1ee6, 0x1ee6, 0x1ee8, 0x1ee8, 0x1eea, 0x1eea, + 0x1eec, 0x1eec, 0x1eee, 0x1eee, 0x1ef0, 0x1ef0, 0x1ef2, 0x1ef2, + 0x1ef4, 0x1ef4, 0x1ef6, 0x1ef6, 0x1ef8, 0x1ef8, 0x1efa, 0x1efb, + 0x1efc, 0x1efd, 0x1efe, 0x1eff, 0x1f08, 0x1f09, 0x1f0a, 0x1f0b, + 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f, 0x1f08, 0x1f09, 0x1f0a, 0x1f0b, + 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f, 0x1f18, 0x1f19, 0x1f1a, 0x1f1b, + 0x1f1c, 0x1f1d, 0x1f16, 0x1f17, 0x1f18, 0x1f19, 0x1f1a, 0x1f1b, + 0x1f1c, 0x1f1d, 0x1f1e, 0x1f1f, 0x1f28, 0x1f29, 0x1f2a, 0x1f2b, + 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f, 0x1f28, 0x1f29, 0x1f2a, 0x1f2b, + 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f, 0x1f38, 0x1f39, 0x1f3a, 0x1f3b, + 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f, 0x1f38, 0x1f39, 0x1f3a, 0x1f3b, + 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f, 0x1f48, 0x1f49, 0x1f4a, 0x1f4b, + 0x1f4c, 0x1f4d, 0x1f46, 0x1f47, 0x1f48, 0x1f49, 0x1f4a, 0x1f4b, + 0x1f4c, 0x1f4d, 0x1f4e, 0x1f4f, 0x1f50, 0x1f59, 0x1f52, 0x1f5b, + 0x1f54, 0x1f5d, 0x1f56, 0x1f5f, 0x1f58, 0x1f59, 0x1f5a, 0x1f5b, + 0x1f5c, 0x1f5d, 0x1f5e, 0x1f5f, 0x1f68, 0x1f69, 0x1f6a, 0x1f6b, + 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f, 0x1f68, 0x1f69, 0x1f6a, 0x1f6b, + 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f, 0x1fba, 0x1fbb, 0x1fc8, 0x1fc9, + 0x1fca, 0x1fcb, 0x1fda, 0x1fdb, 0x1ff8, 0x1ff9, 0x1fea, 0x1feb, + 0x1ffa, 0x1ffb, 0x1f7e, 0x1f7f, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, + 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b, + 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, + 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b, + 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, + 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab, + 0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fb8, 0x1fb9, 0x1fb2, 0x1fbc, + 0x1fb4, 0x1fb5, 0x1fb6, 0x1fb7, 0x1fb8, 0x1fb9, 0x1fba, 0x1fbb, + 0x1fbc, 0x1fbd, 0x1fbe, 0x1fbf, 0x1fc0, 0x1fc1, 0x1fc2, 0x1fc3, + 0x1fc4, 0x1fc5, 0x1fc6, 0x1fc7, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb, + 0x1fc3, 0x1fcd, 0x1fce, 0x1fcf, 0x1fd8, 0x1fd9, 0x1fd2, 0x1fd3, + 0x1fd4, 0x1fd5, 0x1fd6, 0x1fd7, 0x1fd8, 0x1fd9, 0x1fda, 0x1fdb, + 0x1fdc, 0x1fdd, 0x1fde, 0x1fdf, 0x1fe8, 0x1fe9, 0x1fe2, 0x1fe3, + 0x1fe4, 0x1fec, 0x1fe6, 0x1fe7, 0x1fe8, 0x1fe9, 0x1fea, 0x1feb, + 0x1fec, 0x1fed, 0x1fee, 0x1fef, 0x1ff0, 0x1ff1, 0x1ff2, 0x1ff3, + 0x1ff4, 0x1ff5, 0x1ff6, 0x1ff7, 0x1ff8, 0x1ff9, 0x1ffa, 0x1ffb, + 0x1ff3, 0x1ffd, 0x1ffe, 0x1fff, 0x2000, 0x2001, 0x2002, 0x2003, + 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200a, 0x200b, + 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2013, + 0x2014, 0x2015, 0x2016, 0x2017, 0x2018, 0x2019, 0x201a, 0x201b, + 0x201c, 0x201d, 0x201e, 0x201f, 0x2020, 0x2021, 0x2022, 0x2023, + 0x2024, 0x2025, 0x2026, 0x2027, 0x2028, 0x2029, 0x202a, 0x202b, + 0x202c, 0x202d, 0x202e, 0x202f, 0x2030, 0x2031, 0x2032, 0x2033, + 0x2034, 0x2035, 0x2036, 0x2037, 0x2038, 0x2039, 0x203a, 0x203b, + 0x203c, 0x203d, 0x203e, 0x203f, 0x2040, 0x2041, 0x2042, 0x2043, + 0x2044, 0x2045, 0x2046, 0x2047, 0x2048, 0x2049, 0x204a, 0x204b, + 0x204c, 0x204d, 0x204e, 0x204f, 0x2050, 0x2051, 0x2052, 0x2053, + 0x2054, 0x2055, 0x2056, 0x2057, 0x2058, 0x2059, 0x205a, 0x205b, + 0x205c, 0x205d, 0x205e, 0x205f, 0x2060, 0x2061, 0x2062, 0x2063, + 0x2064, 0x2065, 0x2066, 0x2067, 0x2068, 0x2069, 0x206a, 0x206b, + 0x206c, 0x206d, 0x206e, 0x206f, 0x2070, 0x2071, 0x2072, 0x2073, + 0x2074, 0x2075, 0x2076, 0x2077, 0x2078, 0x2079, 0x207a, 0x207b, + 0x207c, 0x207d, 0x207e, 0x207f, 0x2080, 0x2081, 0x2082, 0x2083, + 0x2084, 0x2085, 0x2086, 0x2087, 0x2088, 0x2089, 0x208a, 0x208b, + 0x208c, 0x208d, 0x208e, 0x208f, 0x2090, 0x2091, 0x2092, 0x2093, + 0x2094, 0x2095, 0x2096, 0x2097, 0x2098, 0x2099, 0x209a, 0x209b, + 0x209c, 0x209d, 0x209e, 0x209f, 0x20a0, 0x20a1, 0x20a2, 0x20a3, + 0x20a4, 0x20a5, 0x20a6, 0x20a7, 0x20a8, 0x20a9, 0x20aa, 0x20ab, + 0x20ac, 0x20ad, 0x20ae, 0x20af, 0x20b0, 0x20b1, 0x20b2, 0x20b3, + 0x20b4, 0x20b5, 0x20b6, 0x20b7, 0x20b8, 0x20b9, 0x20ba, 0x20bb, + 0x20bc, 0x20bd, 0x20be, 0x20bf, 0x20c0, 0x20c1, 0x20c2, 0x20c3, + 0x20c4, 0x20c5, 0x20c6, 0x20c7, 0x20c8, 0x20c9, 0x20ca, 0x20cb, + 0x20cc, 0x20cd, 0x20ce, 0x20cf, 0x20d0, 0x20d1, 0x20d2, 0x20d3, + 0x20d4, 0x20d5, 0x20d6, 0x20d7, 0x20d8, 0x20d9, 0x20da, 0x20db, + 0x20dc, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x20e1, 0x20e2, 0x20e3, + 0x20e4, 0x20e5, 0x20e6, 0x20e7, 0x20e8, 0x20e9, 0x20ea, 0x20eb, + 0x20ec, 0x20ed, 0x20ee, 0x20ef, 0x20f0, 0x20f1, 0x20f2, 0x20f3, + 0x20f4, 0x20f5, 0x20f6, 0x20f7, 0x20f8, 0x20f9, 0x20fa, 0x20fb, + 0x20fc, 0x20fd, 0x20fe, 0x20ff, 0x2100, 0x2101, 0x2102, 0x2103, + 0x2104, 0x2105, 0x2106, 0x2107, 0x2108, 0x2109, 0x210a, 0x210b, + 0x210c, 0x210d, 0x210e, 0x210f, 0x2110, 0x2111, 0x2112, 0x2113, + 0x2114, 0x2115, 0x2116, 0x2117, 0x2118, 0x2119, 0x211a, 0x211b, + 0x211c, 0x211d, 0x211e, 0x211f, 0x2120, 0x2121, 0x2122, 0x2123, + 0x2124, 0x2125, 0x2126, 0x2127, 0x2128, 0x2129, 0x212a, 0x212b, + 0x212c, 0x212d, 0x212e, 0x212f, 0x2130, 0x2131, 0x2132, 0x2133, + 0x2134, 0x2135, 0x2136, 0x2137, 0x2138, 0x2139, 0x213a, 0x213b, + 0x213c, 0x213d, 0x213e, 0x213f, 0x2140, 0x2141, 0x2142, 0x2143, + 0x2144, 0x2145, 0x2146, 0x2147, 0x2148, 0x2149, 0x214a, 0x214b, + 0x214c, 0x214d, 0x2132, 0x214f, 0x2150, 0x2151, 0x2152, 0x2153, + 0x2154, 0x2155, 0x2156, 0x2157, 0x2158, 0x2159, 0x215a, 0x215b, + 0x215c, 0x215d, 0x215e, 0x215f, 0x2160, 0x2161, 0x2162, 0x2163, + 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, 0x2169, 0x216a, 0x216b, + 0x216c, 0x216d, 0x216e, 0x216f, 0x2160, 0x2161, 0x2162, 0x2163, + 0x2164, 0x2165, 0x2166, 0x2167, 0x2168, 0x2169, 0x216a, 0x216b, + 0x216c, 0x216d, 0x216e, 0x216f, 0x2180, 0x2181, 0x2182, 0x2183, + 0x2183, 0xffff, 0x034b, 0x24b6, 0x24b7, 0x24b8, 0x24b9, 0x24ba, + 0x24bb, 0x24bc, 0x24bd, 0x24be, 0x24bf, 0x24c0, 0x24c1, 0x24c2, + 0x24c3, 0x24c4, 0x24c5, 0x24c6, 0x24c7, 0x24c8, 0x24c9, 0x24ca, + 0x24cb, 0x24cc, 0x24cd, 0x24ce, 0x24cf, 0xffff, 0x0746, 0x2c00, + 0x2c01, 0x2c02, 0x2c03, 0x2c04, 0x2c05, 0x2c06, 0x2c07, 0x2c08, + 0x2c09, 0x2c0a, 0x2c0b, 0x2c0c, 0x2c0d, 0x2c0e, 0x2c0f, 0x2c10, + 0x2c11, 0x2c12, 0x2c13, 0x2c14, 0x2c15, 0x2c16, 0x2c17, 0x2c18, + 0x2c19, 0x2c1a, 0x2c1b, 0x2c1c, 0x2c1d, 0x2c1e, 0x2c1f, 0x2c20, + 0x2c21, 0x2c22, 0x2c23, 0x2c24, 0x2c25, 0x2c26, 0x2c27, 0x2c28, + 0x2c29, 0x2c2a, 0x2c2b, 0x2c2c, 0x2c2d, 0x2c2e, 0x2c5f, 0x2c60, + 0x2c60, 0x2c62, 0x2c63, 0x2c64, 0x2c65, 0x2c66, 0x2c67, 0x2c67, + 0x2c69, 0x2c69, 0x2c6b, 0x2c6b, 0x2c6d, 0x2c6e, 0x2c6f, 0x2c70, + 0x2c71, 0x2c72, 0x2c73, 0x2c74, 0x2c75, 0x2c75, 0x2c77, 0x2c78, + 0x2c79, 0x2c7a, 0x2c7b, 0x2c7c, 0x2c7d, 0x2c7e, 0x2c7f, 0x2c80, + 0x2c80, 0x2c82, 0x2c82, 0x2c84, 0x2c84, 0x2c86, 0x2c86, 0x2c88, + 0x2c88, 0x2c8a, 0x2c8a, 0x2c8c, 0x2c8c, 0x2c8e, 0x2c8e, 0x2c90, + 0x2c90, 0x2c92, 0x2c92, 0x2c94, 0x2c94, 0x2c96, 0x2c96, 0x2c98, + 0x2c98, 0x2c9a, 0x2c9a, 0x2c9c, 0x2c9c, 0x2c9e, 0x2c9e, 0x2ca0, + 0x2ca0, 0x2ca2, 0x2ca2, 0x2ca4, 0x2ca4, 0x2ca6, 0x2ca6, 0x2ca8, + 0x2ca8, 0x2caa, 0x2caa, 0x2cac, 0x2cac, 0x2cae, 0x2cae, 0x2cb0, + 0x2cb0, 0x2cb2, 0x2cb2, 0x2cb4, 0x2cb4, 0x2cb6, 0x2cb6, 0x2cb8, + 0x2cb8, 0x2cba, 0x2cba, 0x2cbc, 0x2cbc, 0x2cbe, 0x2cbe, 0x2cc0, + 0x2cc0, 0x2cc2, 0x2cc2, 0x2cc4, 0x2cc4, 0x2cc6, 0x2cc6, 0x2cc8, + 0x2cc8, 0x2cca, 0x2cca, 0x2ccc, 0x2ccc, 0x2cce, 0x2cce, 0x2cd0, + 0x2cd0, 0x2cd2, 0x2cd2, 0x2cd4, 0x2cd4, 0x2cd6, 0x2cd6, 0x2cd8, + 0x2cd8, 0x2cda, 0x2cda, 0x2cdc, 0x2cdc, 0x2cde, 0x2cde, 0x2ce0, + 0x2ce0, 0x2ce2, 0x2ce2, 0x2ce4, 0x2ce5, 0x2ce6, 0x2ce7, 0x2ce8, + 0x2ce9, 0x2cea, 0x2ceb, 0x2cec, 0x2ced, 0x2cee, 0x2cef, 0x2cf0, + 0x2cf1, 0x2cf2, 0x2cf3, 0x2cf4, 0x2cf5, 0x2cf6, 0x2cf7, 0x2cf8, + 0x2cf9, 0x2cfa, 0x2cfb, 0x2cfc, 0x2cfd, 0x2cfe, 0x2cff, 0x10a0, + 0x10a1, 0x10a2, 0x10a3, 0x10a4, 0x10a5, 0x10a6, 0x10a7, 0x10a8, + 0x10a9, 0x10aa, 0x10ab, 0x10ac, 0x10ad, 0x10ae, 0x10af, 0x10b0, + 0x10b1, 0x10b2, 0x10b3, 0x10b4, 0x10b5, 0x10b6, 0x10b7, 0x10b8, + 0x10b9, 0x10ba, 0x10bb, 0x10bc, 0x10bd, 0x10be, 0x10bf, 0x10c0, + 0x10c1, 0x10c2, 0x10c3, 0x10c4, 0x10c5, 0xffff, 0xd21b, 0xff21, + 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27, 0xff28, 0xff29, + 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f, 0xff30, 0xff31, + 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37, 0xff38, 0xff39, + 0xff3a, 0xff5b, 0xff5c, 0xff5d, 0xff5e, 0xff5f, 0xff60, 0xff61, + 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67, 0xff68, 0xff69, + 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f, 0xff70, 0xff71, + 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77, 0xff78, 0xff79, + 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f, 0xff80, 0xff81, + 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87, 0xff88, 0xff89, + 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f, 0xff90, 0xff91, + 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97, 0xff98, 0xff99, + 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f, 0xffa0, 0xffa1, + 0xffa2, 0xffa3, 0xffa4, 0xffa5, 0xffa6, 0xffa7, 0xffa8, 0xffa9, + 0xffaa, 0xffab, 0xffac, 0xffad, 0xffae, 0xffaf, 0xffb0, 0xffb1, + 0xffb2, 0xffb3, 0xffb4, 0xffb5, 0xffb6, 0xffb7, 0xffb8, 0xffb9, + 0xffba, 0xffbb, 0xffbc, 0xffbd, 0xffbe, 0xffbf, 0xffc0, 0xffc1, + 0xffc2, 0xffc3, 0xffc4, 0xffc5, 0xffc6, 0xffc7, 0xffc8, 0xffc9, + 0xffca, 0xffcb, 0xffcc, 0xffcd, 0xffce, 0xffcf, 0xffd0, 0xffd1, + 0xffd2, 0xffd3, 0xffd4, 0xffd5, 0xffd6, 0xffd7, 0xffd8, 0xffd9, + 0xffda, 0xffdb, 0xffdc, 0xffdd, 0xffde, 0xffdf, 0xffe0, 0xffe1, + 0xffe2, 0xffe3, 0xffe4, 0xffe5, 0xffe6, 0xffe7, 0xffe8, 0xffe9, + 0xffea, 0xffeb, 0xffec, 0xffed, 0xffee, 0xffef, 0xfff0, 0xfff1, + 0xfff2, 0xfff3, 0xfff4, 0xfff5, 0xfff6, 0xfff7, 0xfff8, 0xfff9, + 0xfffa, 0xfffb, 0xfffc, 0xfffd, 0xfffe, 0xffff, +}; + +/* + * Allow full-width illegal characters : + * "MS windows 7" supports full-width-invalid-name-characters. + * So we should check half-width-invalid-name-characters(ASCII) only + * for compatibility. + * + * " * / : < > ? \ | + */ +static unsigned short bad_uni_chars[] = { + 0x0022, 0x002A, 0x002F, 0x003A, + 0x003C, 0x003E, 0x003F, 0x005C, 0x007C, + 0 +}; + +static int exfat_convert_char_to_ucs2(struct nls_table *nls, + const unsigned char *ch, int ch_len, unsigned short *ucs2, + int *lossy) +{ + int len; + + *ucs2 = 0x0; + + if (ch[0] < 0x80) { + *ucs2 = ch[0]; + return 1; + } + + len = nls->char2uni(ch, ch_len, ucs2); + if (len < 0) { + /* conversion failed */ + if (lossy != NULL) + *lossy |= NLS_NAME_LOSSY; + *ucs2 = '_'; + return 1; + } + return len; +} + +static int exfat_convert_ucs2_to_char(struct nls_table *nls, + unsigned short ucs2, unsigned char *ch, int *lossy) +{ + int len; + + ch[0] = 0x0; + + if (ucs2 < 0x0080) { + ch[0] = ucs2; + return 1; + } + + len = nls->uni2char(ucs2, ch, MAX_CHARSET_SIZE); + if (len < 0) { + /* conversion failed */ + if (lossy != NULL) + *lossy |= NLS_NAME_LOSSY; + ch[0] = '_'; + return 1; + } + return len; +} + +unsigned short exfat_toupper(struct super_block *sb, unsigned short a) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + return sbi->vol_utbl[a] ? sbi->vol_utbl[a] : a; +} + +static unsigned short *exfat_wstrchr(unsigned short *str, unsigned short wchar) +{ + while (*str) { + if (*(str++) == wchar) + return str; + } + return NULL; +} + +int exfat_uniname_ncmp(struct super_block *sb, unsigned short *a, + unsigned short *b, unsigned int len) +{ + int i; + + for (i = 0; i < len; i++, a++, b++) + if (exfat_toupper(sb, *a) != exfat_toupper(sb, *b)) + return 1; + return 0; +} + +static int exfat_utf16_to_utf8(struct super_block *sb, + struct exfat_uni_name *p_uniname, unsigned char *p_cstring, + int buflen) +{ + int len; + const unsigned short *uniname = p_uniname->name; + + /* always len >= 0 */ + len = utf16s_to_utf8s(uniname, MAX_NAME_LENGTH, UTF16_HOST_ENDIAN, + p_cstring, buflen); + p_cstring[len] = '\0'; + return len; +} + +static int exfat_utf8_to_utf16(struct super_block *sb, + const unsigned char *p_cstring, const int len, + struct exfat_uni_name *p_uniname, int *p_lossy) +{ + int i, unilen, lossy = NLS_NAME_NO_LOSSY; + unsigned short upname[MAX_NAME_LENGTH + 1]; + unsigned short *uniname = p_uniname->name; + + WARN_ON(!len); + + unilen = utf8s_to_utf16s(p_cstring, len, UTF16_HOST_ENDIAN, + (wchar_t *)uniname, MAX_NAME_LENGTH + 2); + if (unilen < 0) { + exfat_msg(sb, KERN_ERR, + "failed to %s (err : %d) nls len : %d", + __func__, unilen, len); + return unilen; + } + + if (unilen > MAX_NAME_LENGTH) { + exfat_msg(sb, KERN_ERR, + "failed to %s (estr:ENAMETOOLONG) nls len : %d, unilen : %d > %d", + __func__, len, unilen, MAX_NAME_LENGTH); + return -ENAMETOOLONG; + } + + p_uniname->name_len = unilen & 0xFF; + + for (i = 0; i < unilen; i++) { + if (*uniname < 0x0020 || + exfat_wstrchr(bad_uni_chars, *uniname)) + lossy |= NLS_NAME_LOSSY; + + upname[i] = exfat_toupper(sb, *uniname); + uniname++; + } + + *uniname = '\0'; + p_uniname->name_len = unilen; + p_uniname->name_hash = exfat_calc_chksum_2byte(upname, unilen << 1, 0, + CS_DEFAULT); + + if (p_lossy) + *p_lossy = lossy; + return unilen; +} + +#define PLANE_SIZE 0x00010000 +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff + +unsigned short exfat_high_surrogate(unicode_t u) +{ + return ((u - PLANE_SIZE) >> 10) + SURROGATE_PAIR; +} + +unsigned short exfat_low_surrogate(unicode_t u) +{ + return ((u - PLANE_SIZE) & SURROGATE_BITS) | SURROGATE_PAIR | + SURROGATE_LOW; +} + +static int __exfat_utf16_to_nls(struct super_block *sb, + struct exfat_uni_name *p_uniname, unsigned char *p_cstring, + int buflen) +{ + int i, j, len, out_len = 0; + unsigned char buf[MAX_CHARSET_SIZE]; + const unsigned short *uniname = p_uniname->name; + struct nls_table *nls = EXFAT_SB(sb)->nls_io; + + i = 0; + while (i < MAX_NAME_LENGTH && out_len < (buflen - 1)) { + if (*uniname == '\0') + break; + if ((*uniname & SURROGATE_MASK) != SURROGATE_PAIR) { + len = exfat_convert_ucs2_to_char(nls, *uniname, buf, + NULL); + } else { + /* Process UTF-16 surrogate pair as one character */ + if (!(*uniname & SURROGATE_LOW) && + i+1 < MAX_NAME_LENGTH && + (*(uniname+1) & SURROGATE_MASK) == SURROGATE_PAIR && + (*(uniname+1) & SURROGATE_LOW)) { + uniname++; + i++; + } + + /* + * UTF-16 surrogate pair encodes code points above + * U+FFFF. Code points above U+FFFF are not supported + * by kernel NLS framework therefore use replacement + * character + */ + len = 1; + buf[0] = '_'; + } + + if (out_len + len >= buflen) + len = buflen - 1 - out_len; + out_len += len; + + if (len > 1) { + for (j = 0; j < len; j++) + *p_cstring++ = buf[j]; + } else { /* len == 1 */ + *p_cstring++ = *buf; + } + + uniname++; + i++; + } + + *p_cstring = '\0'; + return out_len; +} + +static int exfat_nls_to_ucs2(struct super_block *sb, + const unsigned char *p_cstring, const int len, + struct exfat_uni_name *p_uniname, int *p_lossy) +{ + int i = 0, unilen = 0, lossy = NLS_NAME_NO_LOSSY; + unsigned short upname[MAX_NAME_LENGTH + 1]; + unsigned short *uniname = p_uniname->name; + struct nls_table *nls = EXFAT_SB(sb)->nls_io; + + WARN_ON(!len); + + while (unilen < MAX_NAME_LENGTH && i < len) { + i += exfat_convert_char_to_ucs2(nls, p_cstring + i, len - i, + uniname, &lossy); + + if (*uniname < 0x0020 || + exfat_wstrchr(bad_uni_chars, *uniname)) + lossy |= NLS_NAME_LOSSY; + + upname[unilen] = exfat_toupper(sb, *uniname); + uniname++; + unilen++; + } + + if (p_cstring[i] != '\0') + lossy |= NLS_NAME_OVERLEN; + + *uniname = '\0'; + p_uniname->name_len = unilen; + p_uniname->name_hash = exfat_calc_chksum_2byte(upname, unilen << 1, 0, + CS_DEFAULT); + + if (p_lossy) + *p_lossy = lossy; + return unilen; +} + +int exfat_utf16_to_nls(struct super_block *sb, struct exfat_uni_name *uniname, + unsigned char *p_cstring, int buflen) +{ + if (EXFAT_SB(sb)->options.utf8) + return exfat_utf16_to_utf8(sb, uniname, p_cstring, + buflen); + return __exfat_utf16_to_nls(sb, uniname, p_cstring, buflen); +} + +int exfat_nls_to_utf16(struct super_block *sb, const unsigned char *p_cstring, + const int len, struct exfat_uni_name *uniname, int *p_lossy) +{ + if (EXFAT_SB(sb)->options.utf8) + return exfat_utf8_to_utf16(sb, p_cstring, len, + uniname, p_lossy); + return exfat_nls_to_ucs2(sb, p_cstring, len, uniname, p_lossy); +} + +static int exfat_load_upcase_table(struct super_block *sb, + sector_t sector, unsigned long long num_sectors, + unsigned int utbl_checksum) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int sect_size = sb->s_blocksize; + unsigned int i, index = 0, checksum = 0; + int ret; + unsigned char skip = false; + unsigned short *upcase_table; + + upcase_table = kcalloc(UTBL_COUNT, sizeof(unsigned short), GFP_KERNEL); + if (!upcase_table) + return -ENOMEM; + + sbi->vol_utbl = upcase_table; + num_sectors += sector; + + while (sector < num_sectors) { + struct buffer_head *bh; + + bh = sb_bread(sb, sector); + if (!bh) { + exfat_msg(sb, KERN_ERR, + "failed to read sector(0x%llx)\n", + (unsigned long long)sector); + ret = -EIO; + goto free_table; + } + sector++; + for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) { + unsigned short uni = get_unaligned_le16(bh->b_data + i); + + checksum = ((checksum & 1) ? 0x80000000 : 0) + + (checksum >> 1) + + *(((unsigned char *)bh->b_data) + i); + checksum = ((checksum & 1) ? 0x80000000 : 0) + + (checksum >> 1) + + *(((unsigned char *)bh->b_data) + (i + 1)); + + if (skip) { + index += uni; + skip = false; + } else if (uni == index) { + index++; + } else if (uni == 0xFFFF) { + skip = true; + } else { /* uni != index , uni != 0xFFFF */ + upcase_table[index] = uni; + index++; + } + } + brelse(bh); + } + + if (index >= 0xFFFF && utbl_checksum == checksum) + return 0; + + exfat_msg(sb, KERN_ERR, + "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)\n", + index, checksum, utbl_checksum); + ret = -EINVAL; +free_table: + exfat_free_upcase_table(sbi); + return ret; +} + +static int exfat_load_default_upcase_table(struct super_block *sb) +{ + int i, ret = -EIO; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned char skip = false; + unsigned short uni = 0, *upcase_table; + unsigned int index = 0; + + upcase_table = kcalloc(UTBL_COUNT, sizeof(unsigned short), GFP_KERNEL); + if (!upcase_table) + return -ENOMEM; + + sbi->vol_utbl = upcase_table; + + for (i = 0; index <= 0xFFFF && i < EXFAT_NUM_UPCASE; i++) { + uni = uni_def_upcase[i]; + if (skip) { + index += uni; + skip = false; + } else if (uni == index) { + index++; + } else if (uni == 0xFFFF) { + skip = true; + } else { + upcase_table[index] = uni; + index++; + } + } + + if (index >= 0xFFFF) + return 0; + + /* FATAL error: default upcase table has error */ + exfat_free_upcase_table(sbi); + return ret; +} + +int exfat_create_upcase_table(struct super_block *sb) +{ + int i, ret; + unsigned int tbl_clu, type; + sector_t sector; + unsigned long long tbl_size, num_sectors; + unsigned char blksize_bits = sb->s_blocksize_bits; + struct exfat_chain clu; + struct exfat_dentry *ep; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct buffer_head *bh; + + clu.dir = sbi->root_dir; + clu.flags = ALLOC_FAT_CHAIN; + + while (clu.dir != EXFAT_EOF_CLUSTER) { + for (i = 0; i < sbi->dentries_per_clu; i++) { + ep = exfat_get_dentry(sb, &clu, i, &bh, NULL); + if (!ep) + return -EIO; + + type = exfat_get_entry_type(ep); + if (type == TYPE_UNUSED) { + brelse(bh); + break; + } + + if (type != TYPE_UPCASE) { + brelse(bh); + continue; + } + + tbl_clu = le32_to_cpu(ep->dentry.upcase.start_clu); + tbl_size = le64_to_cpu(ep->dentry.upcase.size); + + sector = exfat_cluster_to_sector(sbi, tbl_clu); + num_sectors = ((tbl_size - 1) >> blksize_bits) + 1; + ret = exfat_load_upcase_table(sb, sector, num_sectors, + le32_to_cpu(ep->dentry.upcase.checksum)); + + brelse(bh); + if (ret && ret != -EIO) + goto load_default; + + /* load successfully */ + return ret; + } + + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + } + +load_default: + /* load default upcase table */ + return exfat_load_default_upcase_table(sb); +} + +void exfat_free_upcase_table(struct exfat_sb_info *sbi) +{ + kfree(sbi->vol_utbl); +} diff --git a/fs/exfat/super.c b/fs/exfat/super.c new file mode 100644 index 000000000000..16ed202ef527 --- /dev/null +++ b/fs/exfat/super.c @@ -0,0 +1,722 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2012-2013 Samsung Electronics Co., Ltd. + */ + +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/time.h> +#include <linux/mount.h> +#include <linux/cred.h> +#include <linux/statfs.h> +#include <linux/seq_file.h> +#include <linux/blkdev.h> +#include <linux/fs_struct.h> +#include <linux/iversion.h> +#include <linux/nls.h> +#include <linux/buffer_head.h> + +#include "exfat_raw.h" +#include "exfat_fs.h" + +static char exfat_default_iocharset[] = CONFIG_EXFAT_DEFAULT_IOCHARSET; +static struct kmem_cache *exfat_inode_cachep; + +static void exfat_free_iocharset(struct exfat_sb_info *sbi) +{ + if (sbi->options.iocharset != exfat_default_iocharset) + kfree(sbi->options.iocharset); +} + +static void exfat_delayed_free(struct rcu_head *p) +{ + struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu); + + unload_nls(sbi->nls_io); + exfat_free_iocharset(sbi); + exfat_free_upcase_table(sbi); + kfree(sbi); +} + +static void exfat_put_super(struct super_block *sb) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + mutex_lock(&sbi->s_lock); + if (test_and_clear_bit(EXFAT_SB_DIRTY, &sbi->s_state)) + sync_blockdev(sb->s_bdev); + exfat_set_vol_flags(sb, VOL_CLEAN); + exfat_free_bitmap(sbi); + mutex_unlock(&sbi->s_lock); + + call_rcu(&sbi->rcu, exfat_delayed_free); +} + +static int exfat_sync_fs(struct super_block *sb, int wait) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + int err = 0; + + /* If there are some dirty buffers in the bdev inode */ + mutex_lock(&sbi->s_lock); + if (test_and_clear_bit(EXFAT_SB_DIRTY, &sbi->s_state)) { + sync_blockdev(sb->s_bdev); + if (exfat_set_vol_flags(sb, VOL_CLEAN)) + err = -EIO; + } + mutex_unlock(&sbi->s_lock); + return err; +} + +static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned long long id = huge_encode_dev(sb->s_bdev->bd_dev); + + if (sbi->used_clusters == EXFAT_CLUSTERS_UNTRACKED) { + mutex_lock(&sbi->s_lock); + if (exfat_count_used_clusters(sb, &sbi->used_clusters)) { + mutex_unlock(&sbi->s_lock); + return -EIO; + } + mutex_unlock(&sbi->s_lock); + } + + buf->f_type = sb->s_magic; + buf->f_bsize = sbi->cluster_size; + buf->f_blocks = sbi->num_clusters - 2; /* clu 0 & 1 */ + buf->f_bfree = buf->f_blocks - sbi->used_clusters; + buf->f_bavail = buf->f_bfree; + buf->f_fsid.val[0] = (unsigned int)id; + buf->f_fsid.val[1] = (unsigned int)(id >> 32); + /* Unicode utf16 255 characters */ + buf->f_namelen = EXFAT_MAX_FILE_LEN * NLS_MAX_CHARSET_SIZE; + return 0; +} + +int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct pbr64 *bpb; + bool sync = 0; + + /* flags are not changed */ + if (sbi->vol_flag == new_flag) + return 0; + + sbi->vol_flag = new_flag; + + /* skip updating volume dirty flag, + * if this volume has been mounted with read-only + */ + if (sb_rdonly(sb)) + return 0; + + if (!sbi->pbr_bh) { + sbi->pbr_bh = sb_bread(sb, 0); + if (!sbi->pbr_bh) { + exfat_msg(sb, KERN_ERR, "failed to read boot sector"); + return -ENOMEM; + } + } + + bpb = (struct pbr64 *)sbi->pbr_bh->b_data; + bpb->bsx.vol_flags = cpu_to_le16(new_flag); + + if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->pbr_bh)) + sync = true; + else + sync = false; + + set_buffer_uptodate(sbi->pbr_bh); + mark_buffer_dirty(sbi->pbr_bh); + + if (sync) + sync_dirty_buffer(sbi->pbr_bh); + return 0; +} + +static int exfat_show_options(struct seq_file *m, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_mount_options *opts = &sbi->options; + + /* Show partition info */ + if (!uid_eq(opts->fs_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->fs_uid)); + if (!gid_eq(opts->fs_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->fs_gid)); + seq_printf(m, ",fmask=%04o,dmask=%04o", opts->fs_fmask, opts->fs_dmask); + if (opts->allow_utime) + seq_printf(m, ",allow_utime=%04o", opts->allow_utime); + if (opts->utf8) + seq_puts(m, ",iocharset=utf8"); + else if (sbi->nls_io) + seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); + seq_printf(m, ",bps=%ld", sb->s_blocksize); + if (opts->errors == EXFAT_ERRORS_CONT) + seq_puts(m, ",errors=continue"); + else if (opts->errors == EXFAT_ERRORS_PANIC) + seq_puts(m, ",errors=panic"); + else + seq_puts(m, ",errors=remount-ro"); + if (opts->discard) + seq_puts(m, ",discard"); + if (opts->time_offset) + seq_printf(m, ",time_offset=%d", opts->time_offset); + return 0; +} + +static struct inode *exfat_alloc_inode(struct super_block *sb) +{ + struct exfat_inode_info *ei; + + ei = kmem_cache_alloc(exfat_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + + init_rwsem(&ei->truncate_lock); + return &ei->vfs_inode; +} + +static void exfat_free_inode(struct inode *inode) +{ + kmem_cache_free(exfat_inode_cachep, EXFAT_I(inode)); +} + +static const struct super_operations exfat_sops = { + .alloc_inode = exfat_alloc_inode, + .free_inode = exfat_free_inode, + .write_inode = exfat_write_inode, + .evict_inode = exfat_evict_inode, + .put_super = exfat_put_super, + .sync_fs = exfat_sync_fs, + .statfs = exfat_statfs, + .show_options = exfat_show_options, +}; + +enum { + Opt_uid, + Opt_gid, + Opt_umask, + Opt_dmask, + Opt_fmask, + Opt_allow_utime, + Opt_charset, + Opt_errors, + Opt_discard, + Opt_time_offset, +}; + +static const struct constant_table exfat_param_enums[] = { + { "continue", EXFAT_ERRORS_CONT }, + { "panic", EXFAT_ERRORS_PANIC }, + { "remount-ro", EXFAT_ERRORS_RO }, + {} +}; + +static const struct fs_parameter_spec exfat_parameters[] = { + fsparam_u32("uid", Opt_uid), + fsparam_u32("gid", Opt_gid), + fsparam_u32oct("umask", Opt_umask), + fsparam_u32oct("dmask", Opt_dmask), + fsparam_u32oct("fmask", Opt_fmask), + fsparam_u32oct("allow_utime", Opt_allow_utime), + fsparam_string("iocharset", Opt_charset), + fsparam_enum("errors", Opt_errors, exfat_param_enums), + fsparam_flag("discard", Opt_discard), + fsparam_s32("time_offset", Opt_time_offset), + {} +}; + +static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct exfat_sb_info *sbi = fc->s_fs_info; + struct exfat_mount_options *opts = &sbi->options; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, exfat_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->fs_uid = make_kuid(current_user_ns(), result.uint_32); + break; + case Opt_gid: + opts->fs_gid = make_kgid(current_user_ns(), result.uint_32); + break; + case Opt_umask: + opts->fs_fmask = result.uint_32; + opts->fs_dmask = result.uint_32; + break; + case Opt_dmask: + opts->fs_dmask = result.uint_32; + break; + case Opt_fmask: + opts->fs_fmask = result.uint_32; + break; + case Opt_allow_utime: + opts->allow_utime = result.uint_32 & 0022; + break; + case Opt_charset: + exfat_free_iocharset(sbi); + opts->iocharset = kstrdup(param->string, GFP_KERNEL); + if (!opts->iocharset) + return -ENOMEM; + break; + case Opt_errors: + opts->errors = result.uint_32; + break; + case Opt_discard: + opts->discard = 1; + break; + case Opt_time_offset: + /* + * Make the limit 24 just in case someone invents something + * unusual. + */ + if (result.int_32 < -24 * 60 || result.int_32 > 24 * 60) + return -EINVAL; + opts->time_offset = result.int_32; + break; + default: + return -EINVAL; + } + + return 0; +} + +static void exfat_hash_init(struct super_block *sb) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + int i; + + spin_lock_init(&sbi->inode_hash_lock); + for (i = 0; i < EXFAT_HASH_SIZE; i++) + INIT_HLIST_HEAD(&sbi->inode_hashtable[i]); +} + +static int exfat_read_root(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_inode_info *ei = EXFAT_I(inode); + struct exfat_chain cdir; + int num_subdirs, num_clu = 0; + + exfat_chain_set(&ei->dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + ei->entry = -1; + ei->start_clu = sbi->root_dir; + ei->flags = ALLOC_FAT_CHAIN; + ei->type = TYPE_DIR; + ei->version = 0; + ei->rwoffset = 0; + ei->hint_bmap.off = EXFAT_EOF_CLUSTER; + ei->hint_stat.eidx = 0; + ei->hint_stat.clu = sbi->root_dir; + ei->hint_femp.eidx = EXFAT_HINT_NONE; + + exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + if (exfat_count_num_clusters(sb, &cdir, &num_clu)) + return -EIO; + i_size_write(inode, num_clu << sbi->cluster_size_bits); + + num_subdirs = exfat_count_dir_entries(sb, &cdir); + if (num_subdirs < 0) + return -EIO; + set_nlink(inode, num_subdirs + EXFAT_MIN_SUBDIR); + + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + inode_inc_iversion(inode); + inode->i_generation = 0; + inode->i_mode = exfat_make_mode(sbi, ATTR_SUBDIR, 0777); + inode->i_op = &exfat_dir_inode_operations; + inode->i_fop = &exfat_dir_operations; + + inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) + & ~(sbi->cluster_size - 1)) >> inode->i_blkbits; + EXFAT_I(inode)->i_pos = ((loff_t)sbi->root_dir << 32) | 0xffffffff; + EXFAT_I(inode)->i_size_aligned = i_size_read(inode); + EXFAT_I(inode)->i_size_ondisk = i_size_read(inode); + + exfat_save_attr(inode, ATTR_SUBDIR); + inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = + current_time(inode); + exfat_cache_init_inode(inode); + return 0; +} + +static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb, + struct buffer_head **prev_bh) +{ + struct pbr *p_pbr = (struct pbr *) (*prev_bh)->b_data; + unsigned short logical_sect = 0; + + logical_sect = 1 << p_pbr->bsx.f64.sect_size_bits; + + if (!is_power_of_2(logical_sect) || + logical_sect < 512 || logical_sect > 4096) { + exfat_msg(sb, KERN_ERR, "bogus logical sector size %u", + logical_sect); + return NULL; + } + + if (logical_sect < sb->s_blocksize) { + exfat_msg(sb, KERN_ERR, + "logical sector size too small for device (logical sector size = %u)", + logical_sect); + return NULL; + } + + if (logical_sect > sb->s_blocksize) { + struct buffer_head *bh = NULL; + + __brelse(*prev_bh); + *prev_bh = NULL; + + if (!sb_set_blocksize(sb, logical_sect)) { + exfat_msg(sb, KERN_ERR, + "unable to set blocksize %u", logical_sect); + return NULL; + } + bh = sb_bread(sb, 0); + if (!bh) { + exfat_msg(sb, KERN_ERR, + "unable to read boot sector (logical sector size = %lu)", + sb->s_blocksize); + return NULL; + } + + *prev_bh = bh; + p_pbr = (struct pbr *) bh->b_data; + } + return p_pbr; +} + +/* mount the file system volume */ +static int __exfat_fill_super(struct super_block *sb) +{ + int ret; + struct pbr *p_pbr; + struct pbr64 *p_bpb; + struct buffer_head *bh; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + /* set block size to read super block */ + sb_min_blocksize(sb, 512); + + /* read boot sector */ + bh = sb_bread(sb, 0); + if (!bh) { + exfat_msg(sb, KERN_ERR, "unable to read boot sector"); + return -EIO; + } + + /* PRB is read */ + p_pbr = (struct pbr *)bh->b_data; + + /* check the validity of PBR */ + if (le16_to_cpu((p_pbr->signature)) != PBR_SIGNATURE) { + exfat_msg(sb, KERN_ERR, "invalid boot record signature"); + ret = -EINVAL; + goto free_bh; + } + + + /* check logical sector size */ + p_pbr = exfat_read_pbr_with_logical_sector(sb, &bh); + if (!p_pbr) { + ret = -EIO; + goto free_bh; + } + + /* + * res_zero field must be filled with zero to prevent mounting + * from FAT volume. + */ + if (memchr_inv(p_pbr->bpb.f64.res_zero, 0, + sizeof(p_pbr->bpb.f64.res_zero))) { + ret = -EINVAL; + goto free_bh; + } + + p_bpb = (struct pbr64 *)p_pbr; + if (!p_bpb->bsx.num_fats) { + exfat_msg(sb, KERN_ERR, "bogus number of FAT structure"); + ret = -EINVAL; + goto free_bh; + } + + sbi->sect_per_clus = 1 << p_bpb->bsx.sect_per_clus_bits; + sbi->sect_per_clus_bits = p_bpb->bsx.sect_per_clus_bits; + sbi->cluster_size_bits = sbi->sect_per_clus_bits + sb->s_blocksize_bits; + sbi->cluster_size = 1 << sbi->cluster_size_bits; + sbi->num_FAT_sectors = le32_to_cpu(p_bpb->bsx.fat_length); + sbi->FAT1_start_sector = le32_to_cpu(p_bpb->bsx.fat_offset); + sbi->FAT2_start_sector = p_bpb->bsx.num_fats == 1 ? + sbi->FAT1_start_sector : + sbi->FAT1_start_sector + sbi->num_FAT_sectors; + sbi->data_start_sector = le32_to_cpu(p_bpb->bsx.clu_offset); + sbi->num_sectors = le64_to_cpu(p_bpb->bsx.vol_length); + /* because the cluster index starts with 2 */ + sbi->num_clusters = le32_to_cpu(p_bpb->bsx.clu_count) + + EXFAT_RESERVED_CLUSTERS; + + sbi->root_dir = le32_to_cpu(p_bpb->bsx.root_cluster); + sbi->dentries_per_clu = 1 << + (sbi->cluster_size_bits - DENTRY_SIZE_BITS); + + sbi->vol_flag = le16_to_cpu(p_bpb->bsx.vol_flags); + sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER; + sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED; + + if (le16_to_cpu(p_bpb->bsx.vol_flags) & VOL_DIRTY) { + sbi->vol_flag |= VOL_DIRTY; + exfat_msg(sb, KERN_WARNING, + "Volume was not properly unmounted. Some data may be corrupt. Please run fsck."); + } + + /* exFAT file size is limited by a disk volume size */ + sb->s_maxbytes = (u64)(sbi->num_clusters - EXFAT_RESERVED_CLUSTERS) << + sbi->cluster_size_bits; + + ret = exfat_create_upcase_table(sb); + if (ret) { + exfat_msg(sb, KERN_ERR, "failed to load upcase table"); + goto free_bh; + } + + ret = exfat_load_bitmap(sb); + if (ret) { + exfat_msg(sb, KERN_ERR, "failed to load alloc-bitmap"); + goto free_upcase_table; + } + + ret = exfat_count_used_clusters(sb, &sbi->used_clusters); + if (ret) { + exfat_msg(sb, KERN_ERR, "failed to scan clusters"); + goto free_alloc_bitmap; + } + + return 0; + +free_alloc_bitmap: + exfat_free_bitmap(sbi); +free_upcase_table: + exfat_free_upcase_table(sbi); +free_bh: + brelse(bh); + return ret; +} + +static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct exfat_sb_info *sbi = sb->s_fs_info; + struct exfat_mount_options *opts = &sbi->options; + struct inode *root_inode; + int err; + + if (opts->allow_utime == (unsigned short)-1) + opts->allow_utime = ~opts->fs_dmask & 0022; + + if (opts->discard) { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + + if (!blk_queue_discard(q)) + exfat_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but the device does not support discard"); + opts->discard = 0; + } + + sb->s_flags |= SB_NODIRATIME; + sb->s_magic = EXFAT_SUPER_MAGIC; + sb->s_op = &exfat_sops; + + sb->s_time_gran = 1; + sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS; + sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS; + + err = __exfat_fill_super(sb); + if (err) { + exfat_msg(sb, KERN_ERR, "failed to recognize exfat type"); + goto check_nls_io; + } + + /* set up enough so that it can read an inode */ + exfat_hash_init(sb); + + if (!strcmp(sbi->options.iocharset, "utf8")) + opts->utf8 = 1; + else { + sbi->nls_io = load_nls(sbi->options.iocharset); + if (!sbi->nls_io) { + exfat_msg(sb, KERN_ERR, "IO charset %s not found", + sbi->options.iocharset); + err = -EINVAL; + goto free_table; + } + } + + if (sbi->options.utf8) + sb->s_d_op = &exfat_utf8_dentry_ops; + else + sb->s_d_op = &exfat_dentry_ops; + + root_inode = new_inode(sb); + if (!root_inode) { + exfat_msg(sb, KERN_ERR, "failed to allocate root inode."); + err = -ENOMEM; + goto free_table; + } + + root_inode->i_ino = EXFAT_ROOT_INO; + inode_set_iversion(root_inode, 1); + err = exfat_read_root(root_inode); + if (err) { + exfat_msg(sb, KERN_ERR, "failed to initialize root inode."); + goto put_inode; + } + + exfat_hash_inode(root_inode, EXFAT_I(root_inode)->i_pos); + insert_inode_hash(root_inode); + + sb->s_root = d_make_root(root_inode); + if (!sb->s_root) { + exfat_msg(sb, KERN_ERR, "failed to get the root dentry"); + err = -ENOMEM; + goto put_inode; + } + + return 0; + +put_inode: + iput(root_inode); + sb->s_root = NULL; + +free_table: + exfat_free_upcase_table(sbi); + exfat_free_bitmap(sbi); + +check_nls_io: + unload_nls(sbi->nls_io); + exfat_free_iocharset(sbi); + sb->s_fs_info = NULL; + kfree(sbi); + return err; +} + +static int exfat_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, exfat_fill_super); +} + +static void exfat_free(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations exfat_context_ops = { + .parse_param = exfat_parse_param, + .get_tree = exfat_get_tree, + .free = exfat_free, +}; + +static int exfat_init_fs_context(struct fs_context *fc) +{ + struct exfat_sb_info *sbi; + + sbi = kzalloc(sizeof(struct exfat_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + mutex_init(&sbi->s_lock); + ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + sbi->options.fs_uid = current_uid(); + sbi->options.fs_gid = current_gid(); + sbi->options.fs_fmask = current->fs->umask; + sbi->options.fs_dmask = current->fs->umask; + sbi->options.allow_utime = -1; + sbi->options.iocharset = exfat_default_iocharset; + sbi->options.errors = EXFAT_ERRORS_RO; + + fc->s_fs_info = sbi; + fc->ops = &exfat_context_ops; + return 0; +} + +static struct file_system_type exfat_fs_type = { + .owner = THIS_MODULE, + .name = "exfat", + .init_fs_context = exfat_init_fs_context, + .parameters = exfat_parameters, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +static void exfat_inode_init_once(void *foo) +{ + struct exfat_inode_info *ei = (struct exfat_inode_info *)foo; + + INIT_HLIST_NODE(&ei->i_hash_fat); + inode_init_once(&ei->vfs_inode); +} + +static int __init init_exfat_fs(void) +{ + int err; + + err = exfat_cache_init(); + if (err) + return err; + + exfat_inode_cachep = kmem_cache_create("exfat_inode_cache", + sizeof(struct exfat_inode_info), + 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + exfat_inode_init_once); + if (!exfat_inode_cachep) { + err = -ENOMEM; + goto shutdown_cache; + } + + err = register_filesystem(&exfat_fs_type); + if (err) + goto destroy_cache; + + return 0; + +destroy_cache: + kmem_cache_destroy(exfat_inode_cachep); +shutdown_cache: + exfat_cache_shutdown(); + return err; +} + +static void __exit exit_exfat_fs(void) +{ + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(exfat_inode_cachep); + unregister_filesystem(&exfat_fs_type); + exfat_cache_shutdown(); +} + +module_init(init_exfat_fs); +module_exit(exit_exfat_fs); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("exFAT filesystem support"); +MODULE_AUTHOR("Samsung Electronics Co., Ltd."); diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 0456bc990b5e..943cc469f42f 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -56,6 +56,7 @@ #include <linux/buffer_head.h> #include <linux/init.h> +#include <linux/printk.h> #include <linux/slab.h> #include <linux/mbcache.h> #include <linux/quotaops.h> @@ -84,8 +85,8 @@ printk("\n"); \ } while (0) #else -# define ea_idebug(f...) -# define ea_bdebug(f...) +# define ea_idebug(inode, f...) no_printk(f) +# define ea_bdebug(bh, f...) no_printk(f) #endif static int ext2_xattr_set2(struct inode *, struct buffer_head *, @@ -790,7 +791,15 @@ ext2_xattr_delete_inode(struct inode *inode) struct buffer_head *bh = NULL; struct ext2_sb_info *sbi = EXT2_SB(inode->i_sb); - down_write(&EXT2_I(inode)->xattr_sem); + /* + * We are the only ones holding inode reference. The xattr_sem should + * better be unlocked! We could as well just not acquire xattr_sem at + * all but this makes the code more futureproof. OTOH we need trylock + * here to avoid false-positive warning from lockdep about reclaim + * circular dependency. + */ + if (WARN_ON_ONCE(!down_write_trylock(&EXT2_I(inode)->xattr_sem))) + return; if (!EXT2_I(inode)->i_file_acl) goto cleanup; @@ -864,8 +873,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh) true); if (error) { if (error == -EBUSY) { - ea_bdebug(bh, "already in cache (%d cache entries)", - atomic_read(&ext2_xattr_cache->c_entry_count)); + ea_bdebug(bh, "already in cache"); error = 0; } } else diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index cee888cdc235..16272e6ddcf4 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -39,7 +39,7 @@ struct ext2_xattr_entry { __le32 e_value_block; /* disk block attribute is stored on (n/i) */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ - char e_name[0]; /* attribute name */ + char e_name[]; /* attribute name */ }; #define EXT2_XATTR_PAD_BITS 2 diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8fd0b3cdab4c..0e0a4d6209c7 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -516,10 +516,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, wait_on_buffer(bh); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO); if (!buffer_uptodate(bh)) { - ext4_set_errno(sb, EIO); - ext4_error(sb, "Cannot read block bitmap - " - "block_group = %u, block_bitmap = %llu", - block_group, (unsigned long long) bh->b_blocknr); + ext4_error_err(sb, EIO, "Cannot read block bitmap - " + "block_group = %u, block_bitmap = %llu", + block_group, (unsigned long long) bh->b_blocknr); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 0a734ffb4310..16e9b2fda03a 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -166,10 +166,8 @@ static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + (start_blk + count > ext4_blocks_count(sbi->s_es))) return 0; - } if (system_blks == NULL) return 1; @@ -181,10 +179,8 @@ static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, n = n->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = n->rb_right; - else { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + else return 0; - } } return 1; } @@ -220,10 +216,12 @@ static int ext4_protect_reserved_inode(struct super_block *sb, } else { if (!ext4_data_block_valid_rcu(sbi, system_blks, map.m_pblk, n)) { - ext4_error(sb, "blocks %llu-%llu from inode %u " - "overlap system zone", map.m_pblk, - map.m_pblk + map.m_len - 1, ino); err = -EFSCORRUPTED; + __ext4_error(sb, __func__, __LINE__, -err, + map.m_pblk, "blocks %llu-%llu " + "from inode %u overlap system zone", + map.m_pblk, + map.m_pblk + map.m_len - 1, ino); break; } err = add_system_zone(system_blks, map.m_pblk, n); @@ -365,7 +363,6 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, int ext4_check_blockref(const char *function, unsigned int line, struct inode *inode, __le32 *p, unsigned int max) { - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; __le32 *bref = p; unsigned int blk; @@ -379,7 +376,6 @@ int ext4_check_blockref(const char *function, unsigned int line, if (blk && unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { - es->s_last_error_block = cpu_to_le64(blk); ext4_error_inode(inode, function, line, blk, "invalid block"); return -EFSCORRUPTED; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 9aa1f75409b0..c654205f648d 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -392,7 +392,7 @@ struct fname { __u32 inode; __u8 name_len; __u8 file_type; - char name[0]; + char name[]; }; /* diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 61b37a052052..91eb4381cae5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -414,7 +414,7 @@ struct flex_groups { #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ -#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ @@ -487,7 +487,7 @@ enum { EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ - EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ +/* 22 was formerly EXT4_INODE_EOFBLOCKS */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ @@ -533,7 +533,6 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(VERITY); CHECK_FLAG_VALUE(EA_INODE); - CHECK_FLAG_VALUE(EOFBLOCKS); CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(PROJINHERIT); CHECK_FLAG_VALUE(RESERVED); @@ -2771,21 +2770,20 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); -extern void ext4_set_errno(struct super_block *sb, int err); -extern __printf(4, 5) -void __ext4_error(struct super_block *, const char *, unsigned int, +extern __printf(6, 7) +void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64, const char *, ...); -extern __printf(5, 6) -void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, - const char *, ...); +extern __printf(6, 7) +void __ext4_error_inode(struct inode *, const char *, unsigned int, + ext4_fsblk_t, int, const char *, ...); extern __printf(5, 6) void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); -extern __printf(4, 5) -void __ext4_abort(struct super_block *, const char *, unsigned int, +extern __printf(5, 6) +void __ext4_abort(struct super_block *, const char *, unsigned int, int, const char *, ...); extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, @@ -2806,8 +2804,12 @@ void __ext4_grp_locked_error(const char *, unsigned int, #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) -#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ - ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) +#define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \ + __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a) + +#define ext4_error_inode_block(inode, block, err, fmt, a...) \ + __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \ + (fmt), ## a) #define EXT4_ERROR_FILE(file, block, fmt, a...) \ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) @@ -2815,13 +2817,18 @@ void __ext4_grp_locked_error(const char *, unsigned int, #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) \ - __ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__) + __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__) +#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ + __ext4_error_inode((inode), (func), (line), (block), \ + (err), (fmt), ##__VA_ARGS__) #define ext4_error_file(file, func, line, block, fmt, ...) \ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) #define ext4_error(sb, fmt, ...) \ - __ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) -#define ext4_abort(sb, fmt, ...) \ - __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) + __ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__) +#define ext4_error_err(sb, err, fmt, ...) \ + __ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__) +#define ext4_abort(sb, err, fmt, ...) \ + __ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning_inode(inode, fmt, ...) \ @@ -2839,7 +2846,12 @@ void __ext4_grp_locked_error(const char *, unsigned int, #define ext4_error_inode(inode, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ - __ext4_error_inode(inode, "", 0, block, " "); \ + __ext4_error_inode(inode, "", 0, block, 0, " "); \ +} while (0) +#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error_inode(inode, "", 0, block, err, " "); \ } while (0) #define ext4_error_file(file, func, line, block, fmt, ...) \ do { \ @@ -2849,12 +2861,17 @@ do { \ #define ext4_error(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ - __ext4_error(sb, "", 0, " "); \ + __ext4_error(sb, "", 0, 0, 0, " "); \ +} while (0) +#define ext4_error_err(sb, err, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_error(sb, "", 0, err, 0, " "); \ } while (0) -#define ext4_abort(sb, fmt, ...) \ +#define ext4_abort(sb, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ - __ext4_abort(sb, "", 0, " "); \ + __ext4_abort(sb, "", 0, err, " "); \ } while (0) #define ext4_warning(sb, fmt, ...) \ do { \ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 1f53d64e42a5..7f16e1af8d5c 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -80,8 +80,7 @@ static int ext4_journal_check_start(struct super_block *sb) * take the FS itself readonly cleanly. */ if (journal && is_journal_aborted(journal)) { - ext4_set_errno(sb, -journal->j_errno); - ext4_abort(sb, "Detected aborted journal"); + ext4_abort(sb, -journal->j_errno, "Detected aborted journal"); return -EROFS; } return 0; @@ -272,8 +271,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); - ext4_set_errno(inode->i_sb, -err); - __ext4_abort(inode->i_sb, where, line, + __ext4_abort(inode->i_sb, where, line, -err, "error %d when attempting revoke", err); } BUFFER_TRACE(bh, "exit"); @@ -332,6 +330,7 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, err); } } else { + set_buffer_uptodate(bh); if (inode) mark_buffer_dirty_inode(bh, inode); else @@ -342,11 +341,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, struct ext4_super_block *es; es = EXT4_SB(inode->i_sb)->s_es; - es->s_last_error_block = - cpu_to_le64(bh->b_blocknr); - ext4_set_errno(inode->i_sb, EIO); - ext4_error_inode(inode, where, line, - bh->b_blocknr, + ext4_error_inode_err(inode, where, line, + bh->b_blocknr, EIO, "IO error syncing itable block"); err = -EIO; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 7ea4f6fa173b..4b9002f0e84c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -512,6 +512,9 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) return 0; if (ext4_should_journal_data(inode)) return 0; + /* temporary fix to prevent generic/422 test failures */ + if (!test_opt(inode->i_sb, DELALLOC)) + return 0; return 1; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 954013d6076b..031752cfb6f7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -28,6 +28,7 @@ #include <linux/uaccess.h> #include <linux/fiemap.h> #include <linux/backing-dev.h> +#include <linux/iomap.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@ -83,13 +84,6 @@ static void ext4_extent_block_csum_set(struct inode *inode, et->et_checksum = ext4_extent_block_csum(inode, eh); } -static int ext4_split_extent(handle_t *handle, - struct inode *inode, - struct ext4_ext_path **ppath, - struct ext4_map_blocks *map, - int split_flag, - int flags); - static int ext4_split_extent_at(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, @@ -97,9 +91,6 @@ static int ext4_split_extent_at(handle_t *handle, int split_flag, int flags); -static int ext4_find_delayed_extent(struct inode *inode, - struct extent_status *newes); - static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped) { /* @@ -358,8 +349,8 @@ static int ext4_valid_extent_idx(struct inode *inode, } static int ext4_valid_extent_entries(struct inode *inode, - struct ext4_extent_header *eh, - int depth) + struct ext4_extent_header *eh, + ext4_fsblk_t *pblk, int depth) { unsigned short entries; if (eh->eh_entries == 0) @@ -370,8 +361,6 @@ static int ext4_valid_extent_entries(struct inode *inode, if (depth == 0) { /* leaf entries */ struct ext4_extent *ext = EXT_FIRST_EXTENT(eh); - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - ext4_fsblk_t pblock = 0; ext4_lblk_t lblock = 0; ext4_lblk_t prev = 0; int len = 0; @@ -383,8 +372,7 @@ static int ext4_valid_extent_entries(struct inode *inode, lblock = le32_to_cpu(ext->ee_block); len = ext4_ext_get_actual_len(ext); if ((lblock <= prev) && prev) { - pblock = ext4_ext_pblock(ext); - es->s_last_error_block = cpu_to_le64(pblock); + *pblk = ext4_ext_pblock(ext); return 0; } ext++; @@ -431,7 +419,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid eh_entries"; goto corrupted; } - if (!ext4_valid_extent_entries(inode, eh, depth)) { + if (!ext4_valid_extent_entries(inode, eh, &pblk, depth)) { error_msg = "invalid extent entries"; goto corrupted; } @@ -449,14 +437,14 @@ static int __ext4_ext_check(const char *function, unsigned int line, return 0; corrupted: - ext4_set_errno(inode->i_sb, -err); - ext4_error_inode(inode, function, line, 0, - "pblk %llu bad header/extent: %s - magic %x, " - "entries %u, max %u(%u), depth %u(%u)", - (unsigned long long) pblk, error_msg, - le16_to_cpu(eh->eh_magic), - le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), - max, le16_to_cpu(eh->eh_depth), depth); + ext4_error_inode_err(inode, function, line, 0, -err, + "pblk %llu bad header/extent: %s - magic %x, " + "entries %u, max %u(%u), depth %u(%u)", + (unsigned long long) pblk, error_msg, + le16_to_cpu(eh->eh_magic), + le16_to_cpu(eh->eh_entries), + le16_to_cpu(eh->eh_max), + max, le16_to_cpu(eh->eh_depth), depth); return err; } @@ -556,6 +544,12 @@ int ext4_ext_precache(struct inode *inode) down_read(&ei->i_data_sem); depth = ext_depth(inode); + /* Don't cache anything if there are no external extent blocks */ + if (!depth) { + up_read(&ei->i_data_sem); + return ret; + } + path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), GFP_NOFS); if (path == NULL) { @@ -563,9 +557,6 @@ int ext4_ext_precache(struct inode *inode) return -ENOMEM; } - /* Don't cache anything if there are no external extent blocks */ - if (depth == 0) - goto out; path[0].p_hdr = ext_inode_hdr(inode); ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0); if (ret) @@ -2134,155 +2125,6 @@ cleanup: return err; } -static int ext4_fill_fiemap_extents(struct inode *inode, - ext4_lblk_t block, ext4_lblk_t num, - struct fiemap_extent_info *fieinfo) -{ - struct ext4_ext_path *path = NULL; - struct ext4_extent *ex; - struct extent_status es; - ext4_lblk_t next, next_del, start = 0, end = 0; - ext4_lblk_t last = block + num; - int exists, depth = 0, err = 0; - unsigned int flags = 0; - unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; - - while (block < last && block != EXT_MAX_BLOCKS) { - num = last - block; - /* find extent for this block */ - down_read(&EXT4_I(inode)->i_data_sem); - - path = ext4_find_extent(inode, block, &path, 0); - if (IS_ERR(path)) { - up_read(&EXT4_I(inode)->i_data_sem); - err = PTR_ERR(path); - path = NULL; - break; - } - - depth = ext_depth(inode); - if (unlikely(path[depth].p_hdr == NULL)) { - up_read(&EXT4_I(inode)->i_data_sem); - EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - err = -EFSCORRUPTED; - break; - } - ex = path[depth].p_ext; - next = ext4_ext_next_allocated_block(path); - - flags = 0; - exists = 0; - if (!ex) { - /* there is no extent yet, so try to allocate - * all requested space */ - start = block; - end = block + num; - } else if (le32_to_cpu(ex->ee_block) > block) { - /* need to allocate space before found extent */ - start = block; - end = le32_to_cpu(ex->ee_block); - if (block + num < end) - end = block + num; - } else if (block >= le32_to_cpu(ex->ee_block) - + ext4_ext_get_actual_len(ex)) { - /* need to allocate space after found extent */ - start = block; - end = block + num; - if (end >= next) - end = next; - } else if (block >= le32_to_cpu(ex->ee_block)) { - /* - * some part of requested space is covered - * by found extent - */ - start = block; - end = le32_to_cpu(ex->ee_block) - + ext4_ext_get_actual_len(ex); - if (block + num < end) - end = block + num; - exists = 1; - } else { - BUG(); - } - BUG_ON(end <= start); - - if (!exists) { - es.es_lblk = start; - es.es_len = end - start; - es.es_pblk = 0; - } else { - es.es_lblk = le32_to_cpu(ex->ee_block); - es.es_len = ext4_ext_get_actual_len(ex); - es.es_pblk = ext4_ext_pblock(ex); - if (ext4_ext_is_unwritten(ex)) - flags |= FIEMAP_EXTENT_UNWRITTEN; - } - - /* - * Find delayed extent and update es accordingly. We call - * it even in !exists case to find out whether es is the - * last existing extent or not. - */ - next_del = ext4_find_delayed_extent(inode, &es); - if (!exists && next_del) { - exists = 1; - flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - } - up_read(&EXT4_I(inode)->i_data_sem); - - if (unlikely(es.es_len == 0)) { - EXT4_ERROR_INODE(inode, "es.es_len == 0"); - err = -EFSCORRUPTED; - break; - } - - /* - * This is possible iff next == next_del == EXT_MAX_BLOCKS. - * we need to check next == EXT_MAX_BLOCKS because it is - * possible that an extent is with unwritten and delayed - * status due to when an extent is delayed allocated and - * is allocated by fallocate status tree will track both of - * them in a extent. - * - * So we could return a unwritten and delayed extent, and - * its block is equal to 'next'. - */ - if (next == next_del && next == EXT_MAX_BLOCKS) { - flags |= FIEMAP_EXTENT_LAST; - if (unlikely(next_del != EXT_MAX_BLOCKS || - next != EXT_MAX_BLOCKS)) { - EXT4_ERROR_INODE(inode, - "next extent == %u, next " - "delalloc extent = %u", - next, next_del); - err = -EFSCORRUPTED; - break; - } - } - - if (exists) { - err = fiemap_fill_next_extent(fieinfo, - (__u64)es.es_lblk << blksize_bits, - (__u64)es.es_pblk << blksize_bits, - (__u64)es.es_len << blksize_bits, - flags); - if (err < 0) - break; - if (err == 1) { - err = 0; - break; - } - } - - block = es.es_lblk + es.es_len; - } - - ext4_ext_drop_refs(path); - kfree(path); - return err; -} - static int ext4_fill_es_cache_info(struct inode *inode, ext4_lblk_t block, ext4_lblk_t num, struct fiemap_extent_info *fieinfo) @@ -3874,64 +3716,11 @@ out: return err; } -/* - * Handle EOFBLOCKS_FL flag, clearing it if necessary - */ -static int check_eofblocks_fl(handle_t *handle, struct inode *inode, - ext4_lblk_t lblk, - struct ext4_ext_path *path, - unsigned int len) -{ - int i, depth; - struct ext4_extent_header *eh; - struct ext4_extent *last_ex; - - if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) - return 0; - - depth = ext_depth(inode); - eh = path[depth].p_hdr; - - /* - * We're going to remove EOFBLOCKS_FL entirely in future so we - * do not care for this case anymore. Simply remove the flag - * if there are no extents. - */ - if (unlikely(!eh->eh_entries)) - goto out; - last_ex = EXT_LAST_EXTENT(eh); - /* - * We should clear the EOFBLOCKS_FL flag if we are writing the - * last block in the last extent in the file. We test this by - * first checking to see if the caller to - * ext4_ext_get_blocks() was interested in the last block (or - * a block beyond the last block) in the current extent. If - * this turns out to be false, we can bail out from this - * function immediately. - */ - if (lblk + len < le32_to_cpu(last_ex->ee_block) + - ext4_ext_get_actual_len(last_ex)) - return 0; - /* - * If the caller does appear to be planning to write at or - * beyond the end of the current extent, we then test to see - * if the current extent is the last extent in the file, by - * checking to make sure it was reached via the rightmost node - * at each level of the tree. - */ - for (i = depth-1; i >= 0; i--) - if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) - return 0; -out: - ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - return ext4_mark_inode_dirty(handle, inode); -} - static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, - unsigned int allocated) + unsigned int *allocated) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; @@ -3991,14 +3780,12 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, ext4_ext_show_leaf(inode, path); ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); - if (err) - return err; + map->m_flags |= EXT4_MAP_UNWRITTEN; - if (allocated > map->m_len) - allocated = map->m_len; - map->m_len = allocated; - return allocated; + if (*allocated > map->m_len) + *allocated = map->m_len; + map->m_len = *allocated; + return 0; } static int @@ -4007,7 +3794,9 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { +#ifdef EXT_DEBUG struct ext4_ext_path *path = *ppath; +#endif int ret = 0; int err = 0; @@ -4047,11 +3836,9 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } ret = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); - if (ret >= 0) { + if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, - path, map->m_len); - } else + else err = ret; map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = newblock; @@ -4100,12 +3887,6 @@ out: map_out: map->m_flags |= EXT4_MAP_MAPPED; - if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) { - err = check_eofblocks_fl(handle, inode, map->m_lblk, path, - map->m_len); - if (err < 0) - goto out2; - } out1: if (allocated > map->m_len) allocated = map->m_len; @@ -4244,12 +4025,11 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth, ret; + int err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; ext4_lblk_t cluster_offset; - bool map_from_cluster = false; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); @@ -4308,12 +4088,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { - allocated = convert_initialized_extent( - handle, inode, map, &path, - allocated); + err = convert_initialized_extent(handle, + inode, map, &path, &allocated); goto out2; - } else if (!ext4_ext_is_unwritten(ex)) + } else if (!ext4_ext_is_unwritten(ex)) { goto out; + } ret = ext4_ext_handle_unwritten_extents( handle, inode, map, &path, flags, @@ -4364,7 +4144,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map_from_cluster = true; goto got_allocated_blocks; } @@ -4385,7 +4164,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map_from_cluster = true; goto got_allocated_blocks; } @@ -4442,7 +4220,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out2; ext_debug("allocate new block: goal %llu, found %llu/%u\n", ar.goal, newblock, allocated); - free_on_err = 1; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; if (ar.len > allocated) @@ -4453,28 +4230,28 @@ got_allocated_blocks: ext4_ext_store_pblock(&newex, newblock + offset); newex.ee_len = cpu_to_le16(ar.len); /* Mark unwritten */ - if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){ + if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { ext4_ext_mark_unwritten(&newex); map->m_flags |= EXT4_MAP_UNWRITTEN; } - err = 0; - if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) - err = check_eofblocks_fl(handle, inode, map->m_lblk, - path, ar.len); - if (!err) - err = ext4_ext_insert_extent(handle, inode, &path, - &newex, flags); - - if (err && free_on_err) { - int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? - EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; - /* free data blocks we just allocated */ - /* not a good idea to call discard here directly, - * but otherwise we'd need to call it every free() */ - ext4_discard_preallocations(inode); - ext4_free_blocks(handle, inode, NULL, newblock, - EXT4_C2B(sbi, allocated_clusters), fb_flags); + err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); + if (err) { + if (allocated_clusters) { + int fb_flags = 0; + + /* + * free data blocks we just allocated. + * not a good idea to call discard here directly, + * but otherwise we'd need to call it every free(). + */ + ext4_discard_preallocations(inode); + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE; + ext4_free_blocks(handle, inode, NULL, newblock, + EXT4_C2B(sbi, allocated_clusters), + fb_flags); + } goto out2; } @@ -4491,7 +4268,7 @@ got_allocated_blocks: * clusters discovered to be delayed allocated. Once allocated, a * cluster is not included in the reserved count. */ - if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) { + if (test_opt(inode->i_sb, DELALLOC) && allocated_clusters) { if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { /* * When allocating delayed allocated clusters, simply @@ -4645,10 +4422,6 @@ retry: epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) inode->i_mtime = inode->i_ctime; - } else { - if (epos > inode->i_size) - ext4_set_inode_flag(inode, - EXT4_INODE_EOFBLOCKS); } ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -4802,16 +4575,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, } inode->i_mtime = inode->i_ctime = current_time(inode); - if (new_size) { + if (new_size) ext4_update_inode_size(inode, new_size); - } else { - /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. - */ - if (offset + len > inode->i_size) - ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - } ext4_mark_inode_dirty(handle, inode); /* Zero out partial block at the edges of the range */ @@ -5009,64 +4774,13 @@ int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end) return ret < 0 ? ret : err; } -/* - * If newes is not existing extent (newes->ec_pblk equals zero) find - * delayed extent at start of newes and update newes accordingly and - * return start of the next delayed extent. - * - * If newes is existing extent (newes->ec_pblk is not equal zero) - * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed - * extent found. Leave newes unmodified. - */ -static int ext4_find_delayed_extent(struct inode *inode, - struct extent_status *newes) -{ - struct extent_status es; - ext4_lblk_t block, next_del; - - if (newes->es_pblk == 0) { - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, - newes->es_lblk, - newes->es_lblk + newes->es_len - 1, - &es); - - /* - * No extent in extent-tree contains block @newes->es_pblk, - * then the block may stay in 1)a hole or 2)delayed-extent. - */ - if (es.es_len == 0) - /* A hole found. */ - return 0; - - if (es.es_lblk > newes->es_lblk) { - /* A hole found. */ - newes->es_len = min(es.es_lblk - newes->es_lblk, - newes->es_len); - return 0; - } - - newes->es_len = es.es_lblk + es.es_len - newes->es_lblk; - } - - block = newes->es_lblk + newes->es_len; - ext4_es_find_extent_range(inode, &ext4_es_is_delayed, block, - EXT_MAX_BLOCKS, &es); - if (es.es_len == 0) - next_del = EXT_MAX_BLOCKS; - else - next_del = es.es_lblk; - - return next_del; -} - -static int ext4_xattr_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo) +static int ext4_iomap_xattr_fiemap(struct inode *inode, struct iomap *iomap) { __u64 physical = 0; - __u64 length; - __u32 flags = FIEMAP_EXTENT_LAST; + __u64 length = 0; int blockbits = inode->i_sb->s_blocksize_bits; int error = 0; + u16 iomap_type; /* in-inode? */ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { @@ -5081,40 +4795,49 @@ static int ext4_xattr_fiemap(struct inode *inode, EXT4_I(inode)->i_extra_isize; physical += offset; length = EXT4_SB(inode->i_sb)->s_inode_size - offset; - flags |= FIEMAP_EXTENT_DATA_INLINE; brelse(iloc.bh); - } else { /* external block */ + iomap_type = IOMAP_INLINE; + } else if (EXT4_I(inode)->i_file_acl) { /* external block */ physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; + iomap_type = IOMAP_MAPPED; + } else { + /* no in-inode or external block for xattr, so return -ENOENT */ + error = -ENOENT; + goto out; } - if (physical) - error = fiemap_fill_next_extent(fieinfo, 0, physical, - length, flags); - return (error < 0 ? error : 0); + iomap->addr = physical; + iomap->offset = 0; + iomap->length = length; + iomap->type = iomap_type; + iomap->flags = 0; +out: + return error; } -static int _ext4_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, - int (*fill)(struct inode *, ext4_lblk_t, - ext4_lblk_t, - struct fiemap_extent_info *)) +static int ext4_iomap_xattr_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned flags, + struct iomap *iomap, struct iomap *srcmap) { - ext4_lblk_t start_blk; - u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR; + int error; - int error = 0; - - if (ext4_has_inline_data(inode)) { - int has_inline = 1; + error = ext4_iomap_xattr_fiemap(inode, iomap); + if (error == 0 && (offset >= iomap->length)) + error = -ENOENT; + return error; +} - error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline, - start, len); +static const struct iomap_ops ext4_iomap_xattr_ops = { + .iomap_begin = ext4_iomap_xattr_begin, +}; - if (has_inline) - return error; - } +static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len, bool from_es_cache) +{ + ext4_lblk_t start_blk; + u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR; + int error = 0; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); @@ -5123,19 +4846,19 @@ static int _ext4_fiemap(struct inode *inode, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - /* fallback to generic here if not in extents fmt */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && - fill == ext4_fill_fiemap_extents) - return generic_block_fiemap(inode, fieinfo, start, len, - ext4_get_block); - - if (fill == ext4_fill_es_cache_info) + if (from_es_cache) ext4_fiemap_flags &= FIEMAP_FLAG_XATTR; + if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) return -EBADR; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { - error = ext4_xattr_fiemap(inode, fieinfo); + fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); + } else if (!from_es_cache) { + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_report_ops); } else { ext4_lblk_t len_blks; __u64 last_blk; @@ -5150,7 +4873,8 @@ static int _ext4_fiemap(struct inode *inode, * Walk the extent tree gathering extent information * and pushing extents back to the user. */ - error = fill(inode, start_blk, len_blks, fieinfo); + error = ext4_fill_es_cache_info(inode, start_blk, len_blks, + fieinfo); } return error; } @@ -5158,8 +4882,7 @@ static int _ext4_fiemap(struct inode *inode, int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { - return _ext4_fiemap(inode, fieinfo, start, len, - ext4_fill_fiemap_extents); + return _ext4_fiemap(inode, fieinfo, start, len, false); } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -5175,8 +4898,7 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, return 0; } - return _ext4_fiemap(inode, fieinfo, start, len, - ext4_fill_es_cache_info); + return _ext4_fiemap(inode, fieinfo, start, len, true); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5f225881176b..0d624250a62b 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -872,6 +872,7 @@ const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, + .iopoll = iomap_dio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f95ee99091e4..b420c9dc444d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -196,10 +196,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); if (!buffer_uptodate(bh)) { put_bh(bh); - ext4_set_errno(sb, EIO); - ext4_error(sb, "Cannot read inode bitmap - " - "block_group = %u, inode_bitmap = %llu", - block_group, bitmap_blk); + ext4_error_err(sb, EIO, "Cannot read inode bitmap - " + "block_group = %u, inode_bitmap = %llu", + block_group, bitmap_blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EIO); @@ -712,21 +711,34 @@ out: static int find_inode_bit(struct super_block *sb, ext4_group_t group, struct buffer_head *bitmap, unsigned long *ino) { + bool check_recently_deleted = EXT4_SB(sb)->s_journal == NULL; + unsigned long recently_deleted_ino = EXT4_INODES_PER_GROUP(sb); + next: *ino = ext4_find_next_zero_bit((unsigned long *) bitmap->b_data, EXT4_INODES_PER_GROUP(sb), *ino); if (*ino >= EXT4_INODES_PER_GROUP(sb)) - return 0; + goto not_found; - if ((EXT4_SB(sb)->s_journal == NULL) && - recently_deleted(sb, group, *ino)) { + if (check_recently_deleted && recently_deleted(sb, group, *ino)) { + recently_deleted_ino = *ino; *ino = *ino + 1; if (*ino < EXT4_INODES_PER_GROUP(sb)) goto next; - return 0; + goto not_found; } - + return 1; +not_found: + if (recently_deleted_ino >= EXT4_INODES_PER_GROUP(sb)) + return 0; + /* + * Not reusing recently deleted inodes is mostly a preference. We don't + * want to report ENOSPC or skew allocation patterns because of that. + * So return even recently deleted inode if we could find better in the + * given range. + */ + *ino = recently_deleted_ino; return 1; } @@ -1231,9 +1243,9 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { err = PTR_ERR(inode); - ext4_set_errno(sb, -err); - ext4_error(sb, "couldn't read orphan inode %lu (err %d)", - ino, err); + ext4_error_err(sb, -err, + "couldn't read orphan inode %lu (err %d)", + ino, err); return inode; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 569fc68e8975..107f0043f67f 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1019,7 +1019,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * (should be rare). */ if (!bh) { - EXT4_ERROR_INODE_BLOCK(inode, nr, + ext4_error_inode_block(inode, nr, EIO, "Read failure"); continue; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index fad82d08fca5..f35e289e17aa 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -98,10 +98,9 @@ int ext4_get_max_inline_size(struct inode *inode) error = ext4_get_inode_loc(inode, &iloc); if (error) { - ext4_set_errno(inode->i_sb, -error); - ext4_error_inode(inode, __func__, __LINE__, 0, - "can't get inode location %lu", - inode->i_ino); + ext4_error_inode_err(inode, __func__, __LINE__, 0, -error, + "can't get inode location %lu", + inode->i_ino); return 0; } @@ -1762,9 +1761,9 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) err = ext4_get_inode_loc(dir, &iloc); if (err) { - ext4_set_errno(dir->i_sb, -err); - EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", - err, dir->i_ino); + EXT4_ERROR_INODE_ERR(dir, -err, + "error %d getting inode %lu block", + err, dir->i_ino); return true; } @@ -1857,47 +1856,6 @@ out: return error; } -int ext4_inline_data_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - int *has_inline, __u64 start, __u64 len) -{ - __u64 physical = 0; - __u64 inline_len; - __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | - FIEMAP_EXTENT_LAST; - int error = 0; - struct ext4_iloc iloc; - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - *has_inline = 0; - goto out; - } - inline_len = min_t(size_t, ext4_get_inline_size(inode), - i_size_read(inode)); - if (start >= inline_len) - goto out; - if (start + len < inline_len) - inline_len = start + len; - inline_len -= start; - - error = ext4_get_inode_loc(inode, &iloc); - if (error) - goto out; - - physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; - physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; - physical += offsetof(struct ext4_inode, i_block); - - brelse(iloc.bh); -out: - up_read(&EXT4_I(inode)->xattr_sem); - if (physical) - error = fiemap_fill_next_extent(fieinfo, start, physical, - inline_len, flags); - return (error < 0 ? error : 0); -} - int ext4_inline_data_truncate(struct inode *inode, int *has_inline) { handle_t *handle; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fa0ff78dc033..e416096fc081 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -269,10 +269,9 @@ void ext4_evict_inode(struct inode *inode) if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { - ext4_set_errno(inode->i_sb, -err); - ext4_error(inode->i_sb, - "couldn't truncate inode %lu (err %d)", - inode->i_ino, err); + ext4_error_err(inode->i_sb, -err, + "couldn't truncate inode %lu (err %d)", + inode->i_ino, err); goto stop_handle; } } @@ -2478,10 +2477,9 @@ update_disksize: up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { - ext4_set_errno(inode->i_sb, -err2); - ext4_error(inode->i_sb, - "Failed to mark inode %lu dirty", - inode->i_ino); + ext4_error_err(inode->i_sb, -err2, + "Failed to mark inode %lu dirty", + inode->i_ino); } if (!err) err = err2; @@ -3212,7 +3210,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) return 0; } - return generic_block_bmap(mapping, block, ext4_get_block); + return iomap_bmap(mapping, block, &ext4_iomap_ops); } static int ext4_readpage(struct file *file, struct page *page) @@ -3333,6 +3331,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; + if ((map->m_flags & EXT4_MAP_MAPPED) && + !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + iomap->flags |= IOMAP_F_MERGED; + /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits @@ -3542,12 +3544,28 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + /* + * Fiemap callers may call for offset beyond s_bitmap_maxbytes. + * So handle it here itself instead of querying ext4_map_blocks(). + * Since ext4_map_blocks() will warn about it and will return + * -EIO error. + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (offset >= sbi->s_bitmap_maxbytes) { + map.m_flags = 0; + goto set_iomap; + } + } + ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; if (ret == 0) delalloc = ext4_iomap_is_delalloc(inode, &map); +set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length); if (delalloc && iomap->type == IOMAP_HOLE) iomap->type = IOMAP_DELALLOC; @@ -4144,8 +4162,6 @@ int ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return 0; - ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); - if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); @@ -4364,8 +4380,7 @@ make_io: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { simulate_eio: - ext4_set_errno(inode->i_sb, EIO); - EXT4_ERROR_INODE_BLOCK(inode, block, + ext4_error_inode_block(inode, block, EIO, "unable to read itable block"); brelse(bh); return -EIO; @@ -4517,7 +4532,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); - __ext4_error(sb, function, line, + __ext4_error(sb, function, line, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); @@ -4580,9 +4595,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (!ext4_inode_csum_verify(inode, raw_inode, ei) || ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) { - ext4_set_errno(inode->i_sb, EFSBADCRC); - ext4_error_inode(inode, function, line, 0, - "iget: checksum invalid"); + ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, + "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } @@ -4812,7 +4826,7 @@ static int ext4_inode_blocks_set(handle_t *handle, struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); - u64 i_blocks = inode->i_blocks; + u64 i_blocks = READ_ONCE(inode->i_blocks); struct super_block *sb = inode->i_sb; if (i_blocks <= ~0U) { @@ -4982,7 +4996,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); - if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) { + if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) { ext4_isize_set(raw_inode, ei->i_disksize); need_datasync = 1; } @@ -5131,9 +5145,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - ext4_set_errno(inode->i_sb, EIO); - EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, - "IO error syncing inode"); + ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, + "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0c1d1720cf1a..bfc1281fc4cb 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -327,18 +327,6 @@ static int ext4_ioctl_setflags(struct inode *inode, if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; - if (flags & EXT4_EOFBLOCKS_FL) { - /* we don't support adding EOFBLOCKS flag */ - if (!(oldflags & EXT4_EOFBLOCKS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (oldflags & EXT4_EOFBLOCKS_FL) { - err = ext4_truncate(inode); - if (err) - goto flags_out; - } - if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) { if (!ext4_has_feature_casefold(sb)) { err = -EOPNOTSUPP; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 51a78eb65f3c..87c85be4c12e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1901,8 +1901,15 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, BUG_ON(buddy == NULL); k = mb_find_next_zero_bit(buddy, max, 0); - BUG_ON(k >= max); - + if (k >= max) { + ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, + "%d free clusters of order %d. But found 0", + grp->bb_counters[i], i); + ext4_mark_group_bitmap_corrupted(ac->ac_sb, + e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + break; + } ac->ac_found++; ac->ac_b_ex.fe_len = 1 << i; @@ -3914,9 +3921,9 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); - ext4_set_errno(sb, -err); - ext4_error(sb, "Error %d reading block bitmap for %u", - err, group); + ext4_error_err(sb, -err, + "Error %d reading block bitmap for %u", + err, group); return 0; } @@ -4083,18 +4090,16 @@ repeat: err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { - ext4_set_errno(sb, -err); - ext4_error(sb, "Error %d loading buddy information for %u", - err, group); + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", + err, group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); - ext4_set_errno(sb, -err); - ext4_error(sb, "Error %d reading block bitmap for %u", - err, group); + ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", + err, group); ext4_mb_unload_buddy(&e4b); continue; } @@ -4302,7 +4307,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], - pa_inode_list) { + pa_inode_list, + lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* @@ -4347,9 +4353,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { - ext4_set_errno(sb, -err); - ext4_error(sb, "Error %d loading buddy information for %u", - err, group); + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", + err, group); continue; } ext4_lock_group(sb, group); @@ -4386,7 +4391,8 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], - pa_inode_list) { + pa_inode_list, + lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { spin_unlock(&tmp_pa->pa_lock); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 87f7551c5132..d34cb8c46655 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -175,8 +175,8 @@ static int kmmpd(void *data) */ if (retval) { if ((failed_writes % 60) == 0) { - ext4_set_errno(sb, -retval); - ext4_error(sb, "Error writing to MMP block"); + ext4_error_err(sb, -retval, + "Error writing to MMP block"); } failed_writes++; } @@ -208,9 +208,9 @@ static int kmmpd(void *data) retval = read_mmp_block(sb, &bh_check, mmp_block); if (retval) { - ext4_set_errno(sb, -retval); - ext4_error(sb, "error reading MMP data: %d", - retval); + ext4_error_err(sb, -retval, + "error reading MMP data: %d", + retval); goto exit_thread; } @@ -222,8 +222,7 @@ static int kmmpd(void *data) "Error while updating MMP info. " "The filesystem seems to have been" " multiply mounted."); - ext4_set_errno(sb, EBUSY); - ext4_error(sb, "abort"); + ext4_error_err(sb, EBUSY, "abort"); put_bh(bh_check); retval = -EBUSY; goto exit_thread; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 30ce3dc69378..1ed86fb6c302 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -422,8 +422,8 @@ repair_branches: block_len_in_page, 0, &err2); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { - EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), - "Unable to copy data block," + ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset), + EIO, "Unable to copy data block," " data will be lost."); *err = -EIO; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b05ea72f38fd..a8aca4772aaa 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -160,9 +160,9 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { - ext4_set_errno(inode->i_sb, EFSBADCRC); - ext4_error_inode(inode, func, line, block, - "Directory index failed checksum"); + ext4_error_inode_err(inode, func, line, block, + EFSBADCRC, + "Directory index failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } @@ -172,9 +172,9 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, !ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_CRC)) set_buffer_verified(bh); else { - ext4_set_errno(inode->i_sb, EFSBADCRC); - ext4_error_inode(inode, func, line, block, - "Directory block failed checksum"); + ext4_error_inode_err(inode, func, line, block, + EFSBADCRC, + "Directory block failed checksum"); brelse(bh); return ERR_PTR(-EFSBADCRC); } @@ -233,13 +233,13 @@ struct dx_root u8 unused_flags; } info; - struct dx_entry entries[0]; + struct dx_entry entries[]; }; struct dx_node { struct fake_dirent fake; - struct dx_entry entries[0]; + struct dx_entry entries[]; }; @@ -1532,9 +1532,9 @@ restart: goto next; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_set_errno(sb, EIO); - EXT4_ERROR_INODE(dir, "reading directory lblock %lu", - (unsigned long) block); + EXT4_ERROR_INODE_ERR(dir, EIO, + "reading directory lblock %lu", + (unsigned long) block); brelse(bh); ret = ERR_PTR(-EIO); goto cleanup_and_exit; @@ -1543,9 +1543,9 @@ restart: !is_dx_internal_node(dir, block, (struct ext4_dir_entry *)bh->b_data) && !ext4_dirblock_csum_verify(dir, bh)) { - ext4_set_errno(sb, EFSBADCRC); - EXT4_ERROR_INODE(dir, "checksumming directory " - "block %lu", (unsigned long)block); + EXT4_ERROR_INODE_ERR(dir, EFSBADCRC, + "checksumming directory " + "block %lu", (unsigned long)block); brelse(bh); ret = ERR_PTR(-EFSBADCRC); goto cleanup_and_exit; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c8dff4c68141..9728e7b0e84f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -335,10 +335,12 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) -static void __save_error_info(struct super_block *sb, const char *func, - unsigned int line) +static void __save_error_info(struct super_block *sb, int error, + __u32 ino, __u64 block, + const char *func, unsigned int line) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; + int err; EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (bdev_read_only(sb->s_bdev)) @@ -347,8 +349,62 @@ static void __save_error_info(struct super_block *sb, const char *func, ext4_update_tstamp(es, s_last_error_time); strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); es->s_last_error_line = cpu_to_le32(line); - if (es->s_last_error_errcode == 0) - es->s_last_error_errcode = EXT4_ERR_EFSCORRUPTED; + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + switch (error) { + case EIO: + err = EXT4_ERR_EIO; + break; + case ENOMEM: + err = EXT4_ERR_ENOMEM; + break; + case EFSBADCRC: + err = EXT4_ERR_EFSBADCRC; + break; + case 0: + case EFSCORRUPTED: + err = EXT4_ERR_EFSCORRUPTED; + break; + case ENOSPC: + err = EXT4_ERR_ENOSPC; + break; + case ENOKEY: + err = EXT4_ERR_ENOKEY; + break; + case EROFS: + err = EXT4_ERR_EROFS; + break; + case EFBIG: + err = EXT4_ERR_EFBIG; + break; + case EEXIST: + err = EXT4_ERR_EEXIST; + break; + case ERANGE: + err = EXT4_ERR_ERANGE; + break; + case EOVERFLOW: + err = EXT4_ERR_EOVERFLOW; + break; + case EBUSY: + err = EXT4_ERR_EBUSY; + break; + case ENOTDIR: + err = EXT4_ERR_ENOTDIR; + break; + case ENOTEMPTY: + err = EXT4_ERR_ENOTEMPTY; + break; + case ESHUTDOWN: + err = EXT4_ERR_ESHUTDOWN; + break; + case EFAULT: + err = EXT4_ERR_EFAULT; + break; + default: + err = EXT4_ERR_UNKNOWN; + } + es->s_last_error_errcode = err; if (!es->s_first_error_time) { es->s_first_error_time = es->s_last_error_time; es->s_first_error_time_hi = es->s_last_error_time_hi; @@ -368,11 +424,13 @@ static void __save_error_info(struct super_block *sb, const char *func, le32_add_cpu(&es->s_error_count, 1); } -static void save_error_info(struct super_block *sb, const char *func, - unsigned int line) +static void save_error_info(struct super_block *sb, int error, + __u32 ino, __u64 block, + const char *func, unsigned int line) { - __save_error_info(sb, func, line); - ext4_commit_super(sb, 1); + __save_error_info(sb, error, ino, block, func, line); + if (!bdev_read_only(sb->s_bdev)) + ext4_commit_super(sb, 1); } /* @@ -477,7 +535,8 @@ static void ext4_handle_error(struct super_block *sb) "EXT4-fs error") void __ext4_error(struct super_block *sb, const char *function, - unsigned int line, const char *fmt, ...) + unsigned int line, int error, __u64 block, + const char *fmt, ...) { struct va_format vaf; va_list args; @@ -495,24 +554,21 @@ void __ext4_error(struct super_block *sb, const char *function, sb->s_id, function, line, current->comm, &vaf); va_end(args); } - save_error_info(sb, function, line); + save_error_info(sb, error, 0, block, function, line); ext4_handle_error(sb); } void __ext4_error_inode(struct inode *inode, const char *function, - unsigned int line, ext4_fsblk_t block, + unsigned int line, ext4_fsblk_t block, int error, const char *fmt, ...) { va_list args; struct va_format vaf; - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return; trace_ext4_error(inode->i_sb, function, line); - es->s_last_error_ino = cpu_to_le32(inode->i_ino); - es->s_last_error_block = cpu_to_le64(block); if (ext4_error_ratelimit(inode->i_sb)) { va_start(args, fmt); vaf.fmt = fmt; @@ -529,7 +585,8 @@ void __ext4_error_inode(struct inode *inode, const char *function, current->comm, &vaf); va_end(args); } - save_error_info(inode->i_sb, function, line); + save_error_info(inode->i_sb, error, inode->i_ino, block, + function, line); ext4_handle_error(inode->i_sb); } @@ -548,7 +605,6 @@ void __ext4_error_file(struct file *file, const char *function, trace_ext4_error(inode->i_sb, function, line); es = EXT4_SB(inode->i_sb)->s_es; - es->s_last_error_ino = cpu_to_le32(inode->i_ino); if (ext4_error_ratelimit(inode->i_sb)) { path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) @@ -570,7 +626,8 @@ void __ext4_error_file(struct file *file, const char *function, current->comm, path, &vaf); va_end(args); } - save_error_info(inode->i_sb, function, line); + save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block, + function, line); ext4_handle_error(inode->i_sb); } @@ -614,66 +671,6 @@ const char *ext4_decode_error(struct super_block *sb, int errno, return errstr; } -void ext4_set_errno(struct super_block *sb, int err) -{ - if (err < 0) - err = -err; - - switch (err) { - case EIO: - err = EXT4_ERR_EIO; - break; - case ENOMEM: - err = EXT4_ERR_ENOMEM; - break; - case EFSBADCRC: - err = EXT4_ERR_EFSBADCRC; - break; - case EFSCORRUPTED: - err = EXT4_ERR_EFSCORRUPTED; - break; - case ENOSPC: - err = EXT4_ERR_ENOSPC; - break; - case ENOKEY: - err = EXT4_ERR_ENOKEY; - break; - case EROFS: - err = EXT4_ERR_EROFS; - break; - case EFBIG: - err = EXT4_ERR_EFBIG; - break; - case EEXIST: - err = EXT4_ERR_EEXIST; - break; - case ERANGE: - err = EXT4_ERR_ERANGE; - break; - case EOVERFLOW: - err = EXT4_ERR_EOVERFLOW; - break; - case EBUSY: - err = EXT4_ERR_EBUSY; - break; - case ENOTDIR: - err = EXT4_ERR_ENOTDIR; - break; - case ENOTEMPTY: - err = EXT4_ERR_ENOTEMPTY; - break; - case ESHUTDOWN: - err = EXT4_ERR_ESHUTDOWN; - break; - case EFAULT: - err = EXT4_ERR_EFAULT; - break; - default: - err = EXT4_ERR_UNKNOWN; - } - EXT4_SB(sb)->s_es->s_last_error_errcode = err; -} - /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ @@ -698,8 +695,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, sb->s_id, function, line, errstr); } - ext4_set_errno(sb, -errno); - save_error_info(sb, function, line); + save_error_info(sb, -errno, 0, 0, function, line); ext4_handle_error(sb); } @@ -714,7 +710,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, */ void __ext4_abort(struct super_block *sb, const char *function, - unsigned int line, const char *fmt, ...) + unsigned int line, int error, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -722,7 +718,7 @@ void __ext4_abort(struct super_block *sb, const char *function, if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return; - save_error_info(sb, function, line); + save_error_info(sb, error, 0, 0, function, line); va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -741,7 +737,6 @@ void __ext4_abort(struct super_block *sb, const char *function, sb->s_flags |= SB_RDONLY; if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); - save_error_info(sb, function, line); } if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) { if (EXT4_SB(sb)->s_journal && @@ -815,15 +810,12 @@ __acquires(bitlock) { struct va_format vaf; va_list args; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) return; trace_ext4_error(sb, function, line); - es->s_last_error_ino = cpu_to_le32(ino); - es->s_last_error_block = cpu_to_le64(block); - __save_error_info(sb, function, line); + __save_error_info(sb, EFSCORRUPTED, ino, block, function, line); if (ext4_error_ratelimit(sb)) { va_start(args, fmt); @@ -1024,17 +1016,22 @@ static void ext4_put_super(struct super_block *sb) destroy_workqueue(sbi->rsv_conversion_wq); + /* + * Unregister sysfs before destroying jbd2 journal. + * Since we could still access attr_journal_task attribute via sysfs + * path which could have sbi->s_journal->j_task as NULL + */ + ext4_unregister_sysfs(sb); + if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; if ((err < 0) && !aborted) { - ext4_set_errno(sb, -err); - ext4_abort(sb, "Couldn't clean up the journal"); + ext4_abort(sb, -err, "Couldn't clean up the journal"); } } - ext4_unregister_sysfs(sb); ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); @@ -2180,6 +2177,14 @@ static int parse_options(char *options, struct super_block *sb, } } #endif + if (test_opt(sb, DIOREAD_NOLOCK)) { + int blocksize = + BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); + if (blocksize < PAGE_SIZE) + ext4_msg(sb, KERN_WARNING, "Warning: mounting with an " + "experimental mount option 'dioread_nolock' " + "for blocksize < PAGE_SIZE"); + } return 1; } @@ -3609,7 +3614,8 @@ int ext4_calculate_overhead(struct super_block *sb) */ if (sbi->s_journal && !sbi->journal_bdev) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); - else if (ext4_has_feature_journal(sb) && !sbi->s_journal) { + else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { + /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); if (j_inode) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; @@ -3785,7 +3791,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, NO_UID32); /* xattr user namespace & acls are now defaulted on */ set_opt(sb, XATTR_USER); - set_opt(sb, DIOREAD_NOLOCK); #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif @@ -3835,6 +3840,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize == PAGE_SIZE) + set_opt(sb, DIOREAD_NOLOCK); + if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { ext4_msg(sb, KERN_ERR, @@ -4157,7 +4166,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || sbi->s_inodes_per_group > blocksize * 8) { ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", - sbi->s_blocks_per_group); + sbi->s_inodes_per_group); goto failed_mount; } sbi->s_itb_per_group = sbi->s_inodes_per_group / @@ -4286,9 +4295,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { - ext4_msg(sb, KERN_WARNING, "groups count too large: %u " + ext4_msg(sb, KERN_WARNING, "groups count too large: %llu " "(block count %llu, first data block %u, " - "blocks per group %lu)", sbi->s_groups_count, + "blocks per group %lu)", blocks_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); @@ -5433,7 +5442,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) - ext4_abort(sb, "Abort forced by user"); + ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); @@ -5622,10 +5631,8 @@ static int ext4_statfs_project(struct super_block *sb, return PTR_ERR(dquot); spin_lock(&dquot->dq_dqb_lock); - limit = dquot->dq_dqb.dqb_bsoftlimit; - if (dquot->dq_dqb.dqb_bhardlimit && - (!limit || dquot->dq_dqb.dqb_bhardlimit < limit)) - limit = dquot->dq_dqb.dqb_bhardlimit; + limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, + dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { @@ -5637,11 +5644,8 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_blocks - curblock) : 0; } - limit = dquot->dq_dqb.dqb_isoftlimit; - if (dquot->dq_dqb.dqb_ihardlimit && - (!limit || dquot->dq_dqb.dqb_ihardlimit < limit)) - limit = dquot->dq_dqb.dqb_ihardlimit; - + limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, + dquot->dq_dqb.dqb_ihardlimit); if (limit && buf->f_files > limit) { buf->f_files = limit; buf->f_ffree = diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8cac7d95c3ad..21df43a25328 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -245,7 +245,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, bh->b_data); errout: if (error) - __ext4_error_inode(inode, function, line, 0, + __ext4_error_inode(inode, function, line, 0, -error, "corrupted xattr block %llu", (unsigned long long) bh->b_blocknr); else @@ -269,7 +269,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); errout: if (error) - __ext4_error_inode(inode, function, line, 0, + __ext4_error_inode(inode, function, line, 0, -error, "corrupted in-inode xattr"); return error; } @@ -2880,9 +2880,9 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (IS_ERR(bh)) { error = PTR_ERR(bh); if (error == -EIO) { - ext4_set_errno(inode->i_sb, EIO); - EXT4_ERROR_INODE(inode, "block %llu read error", - EXT4_I(inode)->i_file_acl); + EXT4_ERROR_INODE_ERR(inode, EIO, + "block %llu read error", + EXT4_I(inode)->i_file_acl); } bh = NULL; goto cleanup; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index f39cad2abe2a..ffe21ac77f78 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -48,7 +48,7 @@ struct ext4_xattr_entry { __le32 e_value_inum; /* inode in which the value is stored */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ - char e_name[0]; /* attribute name */ + char e_name[]; /* attribute name */ }; #define EXT4_XATTR_PAD_BITS 2 @@ -118,7 +118,7 @@ struct ext4_xattr_ibody_find { struct ext4_xattr_inode_array { unsigned int count; /* # of used items in the array */ - struct inode *inodes[0]; + struct inode *inodes[]; }; extern const struct xattr_handler ext4_xattr_user_handler; diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index f0faada30f30..bb68d21e1f8c 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -118,3 +118,12 @@ config F2FS_FS_LZ4 default y help Support LZ4 compress algorithm, if unsure, say Y. + +config F2FS_FS_ZSTD + bool "ZSTD compression support" + depends on F2FS_FS_COMPRESSION + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS + default y + help + Support ZSTD compress algorithm, if unsure, say Y. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 44e84ac5c941..852890b72d6a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -50,9 +50,6 @@ repeat: return page; } -/* - * We guarantee no failure on the returned page. - */ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, bool is_meta) { @@ -206,7 +203,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, } /* - * Readahead CP/NAT/SIT/SSA pages + * Readahead CP/NAT/SIT/SSA/POR pages */ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) @@ -898,7 +895,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) return -ENOMEM; /* * Finding out valid cp block involves read both - * sets( cp pack1 and cp pack 2) + * sets( cp pack 1 and cp pack 2) */ cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr); cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); @@ -1250,20 +1247,20 @@ static void unblock_operations(struct f2fs_sb_info *sbi) f2fs_unlock_all(sbi); } -void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) +void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!get_pages(sbi, F2FS_WB_CP_DATA)) + if (!get_pages(sbi, type)) break; if (unlikely(f2fs_cp_error(sbi))) break; - io_schedule_timeout(5*HZ); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); } finish_wait(&sbi->cp_wait, &wait); } @@ -1301,10 +1298,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) else __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || - is_sbi_flag_set(sbi, SBI_IS_RESIZEFS)) + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS)) + __set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); + else + __clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) __set_ckpt_flags(ckpt, CP_DISABLED_FLAG); else @@ -1384,13 +1385,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); - f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) && - !f2fs_cp_error(sbi)); - /* - * modify checkpoint - * version number is already updated - */ + /* start to update checkpoint, cp ver is already updated previously */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { @@ -1493,11 +1489,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); - f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) && - !f2fs_cp_error(sbi)); + /* Wait for all dirty meta pages to be submitted for IO */ + f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META); /* wait for previous submitted meta pages writeback */ - f2fs_wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); /* flush all device cache */ err = f2fs_flush_device_cache(sbi); @@ -1506,7 +1502,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* barrier and flush checkpoint cp pack 2 page if it can */ commit_checkpoint(sbi, ckpt, start_blk); - f2fs_wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); /* * invalidate intermediate page cache borrowed from meta inode which are @@ -1543,9 +1539,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0; } -/* - * We guarantee that this checkpoint procedure will not fail. - */ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1613,7 +1606,6 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_sit_entries(sbi, cpc); - /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); if (err) f2fs_release_discard_addrs(sbi); @@ -1626,7 +1618,7 @@ stop: if (cpc->reason & CP_RECOVERY) f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); - /* do checkpoint periodically */ + /* update CP_TIME to trigger checkpoint periodically */ f2fs_update_time(sbi, CP_TIME); trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index d8a64be90a50..df7b2d15eacd 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -11,6 +11,7 @@ #include <linux/backing-dev.h> #include <linux/lzo.h> #include <linux/lz4.h> +#include <linux/zstd.h> #include "f2fs.h" #include "node.h" @@ -20,6 +21,8 @@ struct f2fs_compress_ops { int (*init_compress_ctx)(struct compress_ctx *cc); void (*destroy_compress_ctx)(struct compress_ctx *cc); int (*compress_pages)(struct compress_ctx *cc); + int (*init_decompress_ctx)(struct decompress_io_ctx *dic); + void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic); int (*decompress_pages)(struct decompress_io_ctx *dic); }; @@ -52,7 +55,7 @@ bool f2fs_is_compressed_page(struct page *page) } static void f2fs_set_compressed_page(struct page *page, - struct inode *inode, pgoff_t index, void *data, refcount_t *r) + struct inode *inode, pgoff_t index, void *data) { SetPagePrivate(page); set_page_private(page, (unsigned long)data); @@ -60,8 +63,6 @@ static void f2fs_set_compressed_page(struct page *page, /* i_crypto_info and iv index */ page->index = index; page->mapping = inode->i_mapping; - if (r) - refcount_inc(r); } static void f2fs_put_compressed_page(struct page *page) @@ -291,6 +292,165 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = { }; #endif +#ifdef CONFIG_F2FS_FS_ZSTD +#define F2FS_ZSTD_DEFAULT_CLEVEL 1 + +static int zstd_init_compress_ctx(struct compress_ctx *cc) +{ + ZSTD_parameters params; + ZSTD_CStream *stream; + void *workspace; + unsigned int workspace_size; + + params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0); + workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams); + + workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), + workspace_size, GFP_NOFS); + if (!workspace) + return -ENOMEM; + + stream = ZSTD_initCStream(params, 0, workspace, workspace_size); + if (!stream) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initCStream failed\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__); + kvfree(workspace); + return -EIO; + } + + cc->private = workspace; + cc->private2 = stream; + + cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE; + return 0; +} + +static void zstd_destroy_compress_ctx(struct compress_ctx *cc) +{ + kvfree(cc->private); + cc->private = NULL; + cc->private2 = NULL; +} + +static int zstd_compress_pages(struct compress_ctx *cc) +{ + ZSTD_CStream *stream = cc->private2; + ZSTD_inBuffer inbuf; + ZSTD_outBuffer outbuf; + int src_size = cc->rlen; + int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE; + int ret; + + inbuf.pos = 0; + inbuf.src = cc->rbuf; + inbuf.size = src_size; + + outbuf.pos = 0; + outbuf.dst = cc->cbuf->cdata; + outbuf.size = dst_size; + + ret = ZSTD_compressStream(stream, &outbuf, &inbuf); + if (ZSTD_isError(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__, ZSTD_getErrorCode(ret)); + return -EIO; + } + + ret = ZSTD_endStream(stream, &outbuf); + if (ZSTD_isError(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_endStream returned %d\n", + KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, + __func__, ZSTD_getErrorCode(ret)); + return -EIO; + } + + cc->clen = outbuf.pos; + return 0; +} + +static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) +{ + ZSTD_DStream *stream; + void *workspace; + unsigned int workspace_size; + + workspace_size = ZSTD_DStreamWorkspaceBound(MAX_COMPRESS_WINDOW_SIZE); + + workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode), + workspace_size, GFP_NOFS); + if (!workspace) + return -ENOMEM; + + stream = ZSTD_initDStream(MAX_COMPRESS_WINDOW_SIZE, + workspace, workspace_size); + if (!stream) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initDStream failed\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, + __func__); + kvfree(workspace); + return -EIO; + } + + dic->private = workspace; + dic->private2 = stream; + + return 0; +} + +static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic) +{ + kvfree(dic->private); + dic->private = NULL; + dic->private2 = NULL; +} + +static int zstd_decompress_pages(struct decompress_io_ctx *dic) +{ + ZSTD_DStream *stream = dic->private2; + ZSTD_inBuffer inbuf; + ZSTD_outBuffer outbuf; + int ret; + + inbuf.pos = 0; + inbuf.src = dic->cbuf->cdata; + inbuf.size = dic->clen; + + outbuf.pos = 0; + outbuf.dst = dic->rbuf; + outbuf.size = dic->rlen; + + ret = ZSTD_decompressStream(stream, &outbuf, &inbuf); + if (ZSTD_isError(ret)) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n", + KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id, + __func__, ZSTD_getErrorCode(ret)); + return -EIO; + } + + if (dic->rlen != outbuf.pos) { + printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, " + "expected:%lu\n", KERN_ERR, + F2FS_I_SB(dic->inode)->sb->s_id, + __func__, dic->rlen, + PAGE_SIZE << dic->log_cluster_size); + return -EIO; + } + + return 0; +} + +static const struct f2fs_compress_ops f2fs_zstd_ops = { + .init_compress_ctx = zstd_init_compress_ctx, + .destroy_compress_ctx = zstd_destroy_compress_ctx, + .compress_pages = zstd_compress_pages, + .init_decompress_ctx = zstd_init_decompress_ctx, + .destroy_decompress_ctx = zstd_destroy_decompress_ctx, + .decompress_pages = zstd_decompress_pages, +}; +#endif + static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { #ifdef CONFIG_F2FS_FS_LZO &f2fs_lzo_ops, @@ -302,6 +462,11 @@ static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = { #else NULL, #endif +#ifdef CONFIG_F2FS_FS_ZSTD + &f2fs_zstd_ops, +#else + NULL, +#endif }; bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -334,9 +499,11 @@ static int f2fs_compress_pages(struct compress_ctx *cc) trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx, cc->cluster_size, fi->i_compress_algorithm); - ret = cops->init_compress_ctx(cc); - if (ret) - goto out; + if (cops->init_compress_ctx) { + ret = cops->init_compress_ctx(cc); + if (ret) + goto out; + } max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); @@ -380,21 +547,27 @@ static int f2fs_compress_pages(struct compress_ctx *cc) } cc->cbuf->clen = cpu_to_le32(cc->clen); - cc->cbuf->chksum = cpu_to_le32(0); for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) cc->cbuf->reserved[i] = cpu_to_le32(0); + nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); + + /* zero out any unused part of the last page */ + memset(&cc->cbuf->cdata[cc->clen], 0, + (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE)); + vunmap(cc->cbuf); vunmap(cc->rbuf); - nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); - for (i = nr_cpages; i < cc->nr_cpages; i++) { f2fs_put_compressed_page(cc->cpages[i]); cc->cpages[i] = NULL; } + if (cops->destroy_compress_ctx) + cops->destroy_compress_ctx(cc); + cc->nr_cpages = nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, @@ -413,7 +586,8 @@ out_free_cpages: kfree(cc->cpages); cc->cpages = NULL; destroy_compress_ctx: - cops->destroy_compress_ctx(cc); + if (cops->destroy_compress_ctx) + cops->destroy_compress_ctx(cc); out: trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); @@ -447,10 +621,16 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity) goto out_free_dic; } + if (cops->init_decompress_ctx) { + ret = cops->init_decompress_ctx(dic); + if (ret) + goto out_free_dic; + } + dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL); if (!dic->rbuf) { ret = -ENOMEM; - goto out_free_dic; + goto destroy_decompress_ctx; } dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO); @@ -473,7 +653,12 @@ out_vunmap_cbuf: vunmap(dic->cbuf); out_vunmap_rbuf: vunmap(dic->rbuf); +destroy_decompress_ctx: + if (cops->destroy_decompress_ctx) + cops->destroy_decompress_ctx(dic); out_free_dic: + if (verity) + refcount_set(&dic->ref, dic->nr_cpages); if (!verity) f2fs_decompress_end_io(dic->rpages, dic->cluster_size, ret, false); @@ -532,8 +717,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc) return true; } -/* return # of compressed block addresses */ -static int f2fs_compressed_blocks(struct compress_ctx *cc) +static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) { struct dnode_of_data dn; int ret; @@ -554,10 +738,15 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc) for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, + blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i); - if (blkaddr != NULL_ADDR) - ret++; + if (compr) { + if (__is_valid_data_blkaddr(blkaddr)) + ret++; + } else { + if (blkaddr != NULL_ADDR) + ret++; + } } } fail: @@ -565,6 +754,18 @@ fail: return ret; } +/* return # of compressed blocks in compressed cluster */ +static int f2fs_compressed_blocks(struct compress_ctx *cc) +{ + return __f2fs_cluster_blocks(cc, true); +} + +/* return # of valid blocks in compressed cluster */ +static int f2fs_cluster_blocks(struct compress_ctx *cc, bool compr) +{ + return __f2fs_cluster_blocks(cc, false); +} + int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) { struct compress_ctx cc = { @@ -574,7 +775,7 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index) .cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size, }; - return f2fs_compressed_blocks(&cc); + return f2fs_cluster_blocks(&cc, false); } static bool cluster_may_compress(struct compress_ctx *cc) @@ -623,7 +824,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, bool prealloc; retry: - ret = f2fs_compressed_blocks(cc); + ret = f2fs_cluster_blocks(cc, false); if (ret <= 0) return ret; @@ -653,7 +854,7 @@ retry: struct bio *bio = NULL; ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, - &last_block_in_bio, false); + &last_block_in_bio, false, true); f2fs_destroy_compress_ctx(cc); if (ret) goto release_pages; @@ -772,7 +973,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, .encrypted_page = NULL, .compressed_page = NULL, .submitted = false, - .need_lock = LOCK_RETRY, .io_type = io_type, .io_wbc = wbc, .encrypted = f2fs_encrypted_file(cc->inode), @@ -785,16 +985,17 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, loff_t psize; int i, err; - set_new_dnode(&dn, cc->inode, NULL, NULL, 0); + if (!f2fs_trylock_op(sbi)) + return -EAGAIN; - f2fs_lock_op(sbi); + set_new_dnode(&dn, cc->inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (err) goto out_unlock_op; for (i = 0; i < cc->cluster_size; i++) { - if (datablock_addr(dn.inode, dn.node_page, + if (data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i) == NULL_ADDR) goto out_put_dnode; } @@ -813,7 +1014,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; - refcount_set(&cic->ref, 1); + refcount_set(&cic->ref, cc->nr_cpages); cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) << cc->log_cluster_size, GFP_NOFS); if (!cic->rpages) @@ -823,8 +1024,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, for (i = 0; i < cc->nr_cpages; i++) { f2fs_set_compressed_page(cc->cpages[i], inode, - cc->rpages[i + 1]->index, - cic, i ? &cic->ref : NULL); + cc->rpages[i + 1]->index, cic); fio.compressed_page = cc->cpages[i]; if (fio.encrypted) { fio.page = cc->rpages[i + 1]; @@ -843,9 +1043,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, dn.node_page, - dn.ofs_in_node); - fio.page = cic->rpages[i]; + blkaddr = f2fs_data_blkaddr(&dn); + fio.page = cc->rpages[i]; fio.old_blkaddr = blkaddr; /* cluster header */ @@ -895,10 +1094,10 @@ unlock_continue: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - down_write(&fi->i_sem); + spin_lock(&fi->i_size_lock); if (fi->last_disk_size < psize) fi->last_disk_size = psize; - up_write(&fi->i_sem); + spin_unlock(&fi->i_size_lock); f2fs_put_rpages(cc); f2fs_destroy_compress_ctx(cc); @@ -984,24 +1183,30 @@ retry_write: unlock_page(cc->rpages[i]); ret = 0; } else if (ret == -EAGAIN) { + /* + * for quota file, just redirty left pages to + * avoid deadlock caused by cluster update race + * from foreground operation. + */ + if (IS_NOQUOTA(cc->inode)) { + err = 0; + goto out_err; + } ret = 0; cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); lock_page(cc->rpages[i]); clear_page_dirty_for_io(cc->rpages[i]); goto retry_write; } err = ret; - goto out_fail; + goto out_err; } *submitted += _submitted; } return 0; - -out_fail: - /* TODO: revoke partially updated block addresses */ - BUG_ON(compr_blocks); out_err: for (++i; i < cc->cluster_size; i++) { if (!cc->rpages[i]) @@ -1069,7 +1274,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->magic = F2FS_COMPRESSED_PAGE_MAGIC; dic->inode = cc->inode; - refcount_set(&dic->ref, 1); + refcount_set(&dic->ref, cc->nr_cpages); dic->cluster_idx = cc->cluster_idx; dic->cluster_size = cc->cluster_size; dic->log_cluster_size = cc->log_cluster_size; @@ -1093,8 +1298,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) goto out_free; f2fs_set_compressed_page(page, cc->inode, - start_idx + i + 1, - dic, i ? &dic->ref : NULL); + start_idx + i + 1, dic); dic->cpages[i] = page; } @@ -1104,20 +1308,16 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) goto out_free; for (i = 0; i < dic->cluster_size; i++) { - if (cc->rpages[i]) + if (cc->rpages[i]) { + dic->tpages[i] = cc->rpages[i]; continue; + } dic->tpages[i] = f2fs_grab_page(); if (!dic->tpages[i]) goto out_free; } - for (i = 0; i < dic->cluster_size; i++) { - if (dic->tpages[i]) - continue; - dic->tpages[i] = cc->rpages[i]; - } - return dic; out_free: @@ -1133,7 +1333,10 @@ void f2fs_free_dic(struct decompress_io_ctx *dic) for (i = 0; i < dic->cluster_size; i++) { if (dic->rpages[i]) continue; - f2fs_put_page(dic->tpages[i], 1); + if (!dic->tpages[i]) + continue; + unlock_page(dic->tpages[i]); + put_page(dic->tpages[i]); } kfree(dic->tpages); } @@ -1162,15 +1365,17 @@ void f2fs_decompress_end_io(struct page **rpages, if (!rpage) continue; - if (err || PageError(rpage)) { - ClearPageUptodate(rpage); - ClearPageError(rpage); - } else { - if (!verity || fsverity_verify_page(rpage)) - SetPageUptodate(rpage); - else - SetPageError(rpage); + if (err || PageError(rpage)) + goto clear_uptodate; + + if (!verity || fsverity_verify_page(rpage)) { + SetPageUptodate(rpage); + goto unlock; } +clear_uptodate: + ClearPageUptodate(rpage); + ClearPageError(rpage); +unlock: unlock_page(rpage); } } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b27b72107911..cdf2f626bea7 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -54,17 +54,13 @@ static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask, return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset); } -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail) +struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio) { - struct bio *bio; - - if (no_fail) { + if (noio) { /* No failure on bio allocation */ - bio = __f2fs_bio_alloc(GFP_NOIO, npages); - if (!bio) - bio = __f2fs_bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); - return bio; + return __f2fs_bio_alloc(GFP_NOIO, npages); } + if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO); return NULL; @@ -143,6 +139,8 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity) f2fs_decompress_pages(bio, page, verity); continue; } + if (verity) + continue; #endif /* PG_error was set if any post_read step failed */ @@ -191,12 +189,38 @@ static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size) static void f2fs_verify_bio(struct bio *bio) { - struct page *page = bio_first_page_all(bio); - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); + struct bio_vec *bv; + struct bvec_iter_all iter_all; - f2fs_verify_pages(dic->rpages, dic->cluster_size); - f2fs_free_dic(dic); + bio_for_each_segment_all(bv, bio, iter_all) { + struct page *page = bv->bv_page; + struct decompress_io_ctx *dic; + + dic = (struct decompress_io_ctx *)page_private(page); + + if (dic) { + if (refcount_dec_not_one(&dic->ref)) + continue; + f2fs_verify_pages(dic->rpages, + dic->cluster_size); + f2fs_free_dic(dic); + continue; + } + + if (bio->bi_status || PageError(page)) + goto clear_uptodate; + + if (fsverity_verify_page(page)) { + SetPageUptodate(page); + goto unlock; + } +clear_uptodate: + ClearPageUptodate(page); + ClearPageError(page); +unlock: + dec_page_count(F2FS_P_SB(page), __read_io_type(page)); + unlock_page(page); + } } #endif @@ -364,9 +388,6 @@ static void f2fs_write_end_io(struct bio *bio) bio_put(bio); } -/* - * Return true, if pre_bio's bdev is same as its target device. - */ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio) { @@ -403,6 +424,9 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } +/* + * Return true, if pre_bio's bdev is same as its target device. + */ static bool __same_bdev(struct f2fs_sb_info *sbi, block_t blk_addr, struct bio *bio) { @@ -410,9 +434,6 @@ static bool __same_bdev(struct f2fs_sb_info *sbi, return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; } -/* - * Low-level block read/write IO operations. - */ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; @@ -445,7 +466,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (type != DATA && type != NODE) goto submit_io; - if (test_opt(sbi, LFS) && current->plug) + if (f2fs_lfs_mode(sbi) && current->plug) blk_finish_plug(current->plug); if (F2FS_IO_ALIGNED(sbi)) @@ -928,14 +949,15 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned nr_pages, unsigned op_flag, - pgoff_t first_idx) + pgoff_t first_idx, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; struct bio_post_read_ctx *ctx; unsigned int post_read_steps = 0; - bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), + for_write); if (!bio) return ERR_PTR(-ENOMEM); f2fs_target_device(sbi, blkaddr, bio); @@ -970,12 +992,12 @@ static void f2fs_release_read_bio(struct bio *bio) /* This can handle encryption stuffs */ static int f2fs_submit_page_read(struct inode *inode, struct page *page, - block_t blkaddr) + block_t blkaddr, bool for_write) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; - bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index); + bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index, for_write); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -1047,8 +1069,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + block_t blkaddr = f2fs_data_blkaddr(dn); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -1162,7 +1183,7 @@ got_it: return page; } - err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, for_write); if (err) goto put_err; return page; @@ -1300,8 +1321,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) if (err) return err; - dn->data_blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = f2fs_data_blkaddr(dn); if (dn->data_blkaddr != NULL_ADDR) goto alloc; @@ -1388,13 +1408,9 @@ void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) } /* - * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with - * f2fs_map_blocks structure. - * If original data blocks are allocated, then give them to blockdev. - * Otherwise, - * a. preallocate requested block addresses - * b. do not use extent cache for better performance - * c. give the block addresses to blockdev + * f2fs_map_blocks() tries to find or build mapping relationship which + * maps continuous logical blocks to physical blocks, and return such + * info via f2fs_map_blocks structure. */ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) @@ -1422,7 +1438,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, end = pgofs + maxblocks; if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { - if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO && + if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) goto next_dnode; @@ -1467,7 +1483,7 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + blkaddr = f2fs_data_blkaddr(&dn); if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { @@ -1477,7 +1493,7 @@ next_block: if (__is_valid_data_blkaddr(blkaddr)) { /* use out-place-update for driect IO under LFS mode */ - if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO && + if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO && map->m_may_create) { err = __allocate_data_block(&dn, map->m_seg_type); if (err) @@ -1980,7 +1996,8 @@ submit_and_realloc: } if (bio == NULL) { bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, - is_readahead ? REQ_RAHEAD : 0, page->index); + is_readahead ? REQ_RAHEAD : 0, page->index, + false); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2015,7 +2032,7 @@ out: #ifdef CONFIG_F2FS_FS_COMPRESSION int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, - bool is_readahead) + bool is_readahead, bool for_write) { struct dnode_of_data dn; struct inode *inode = cc->inode; @@ -2031,7 +2048,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); - last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + last_block_in_file = (f2fs_readpage_limit(inode) + + blocksize - 1) >> blkbits; /* get rid of pages beyond EOF */ for (i = 0; i < cc->cluster_size; i++) { @@ -2067,7 +2085,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, dn.node_page, + blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i); if (!__is_valid_data_blkaddr(blkaddr)) @@ -2096,7 +2114,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, struct page *page = dic->cpages[i]; block_t blkaddr; - blkaddr = datablock_addr(dn.inode, dn.node_page, + blkaddr = data_blkaddr(dn.inode, dn.node_page, dn.ofs_in_node + i + 1); if (bio && !page_is_mergeable(sbi, bio, @@ -2109,7 +2127,7 @@ submit_and_realloc: if (!bio) { bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages, is_readahead ? REQ_RAHEAD : 0, - page->index); + page->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); bio = NULL; @@ -2210,7 +2228,7 @@ int f2fs_mpage_readpages(struct address_space *mapping, ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead); + is_readahead, false); f2fs_destroy_compress_ctx(&cc); if (ret) goto set_error_page; @@ -2253,7 +2271,7 @@ next_page: ret = f2fs_read_multi_pages(&cc, &bio, max_nr_pages, &last_block_in_bio, - is_readahead); + is_readahead, false); f2fs_destroy_compress_ctx(&cc); } } @@ -2326,7 +2344,7 @@ retry_encrypt: /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { f2fs_flush_merged_writes(fio->sbi); - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } @@ -2397,7 +2415,7 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - if (test_opt(sbi, LFS)) + if (f2fs_lfs_mode(sbi)) return true; if (S_ISDIR(inode->i_mode)) return true; @@ -2647,10 +2665,10 @@ write: if (err) { file_set_keep_isize(inode); } else { - down_write(&F2FS_I(inode)->i_sem); + spin_lock(&F2FS_I(inode)->i_size_lock); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - up_write(&F2FS_I(inode)->i_sem); + spin_unlock(&F2FS_I(inode)->i_size_lock); } done: @@ -2917,7 +2935,7 @@ result: if (wbc->sync_mode == WB_SYNC_ALL) { cond_resched(); congestion_wait(BLK_RW_ASYNC, - HZ/50); + DEFAULT_IO_TIMEOUT); goto retry_write; } goto next; @@ -2973,15 +2991,17 @@ next: static inline bool __should_serialize_io(struct inode *inode, struct writeback_control *wbc) { + /* to avoid deadlock in path of data flush */ + if (F2FS_I(inode)->cp_task) + return false; + if (!S_ISREG(inode->i_mode)) return false; - if (f2fs_compressed_file(inode)) - return true; if (IS_NOQUOTA(inode)) return false; - /* to avoid deadlock in path of data flush */ - if (F2FS_I(inode)->cp_task) - return false; + + if (f2fs_compressed_file(inode)) + return true; if (wbc->sync_mode != WB_SYNC_ALL) return true; if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) @@ -3283,7 +3303,7 @@ repeat: err = -EFSCORRUPTED; goto fail; } - err = f2fs_submit_page_read(inode, page, blkaddr); + err = f2fs_submit_page_read(inode, page, blkaddr, true); if (err) goto fail; @@ -3464,7 +3484,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, rw == WRITE ? get_data_block_dio_write : get_data_block_dio, NULL, f2fs_dio_submit_bio, - DIO_LOCKING | DIO_SKIP_HOLES); + rw == WRITE ? DIO_LOCKING | DIO_SKIP_HOLES : + DIO_SKIP_HOLES); if (do_opu) up_read(&fi->i_gc_rwsem[READ]); @@ -3861,7 +3882,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi) int __init f2fs_init_bio_entry_cache(void) { - bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab", + bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab", sizeof(struct bio_entry)); if (!bio_entry_slab) return -ENOMEM; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 6b89eae5e4ca..0dbcb0f9c019 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -301,6 +301,9 @@ static int stat_show(struct seq_file *s, void *v) si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", si->overp_segs, si->rsvd_segs); + seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n", + ktime_get_boottime_seconds(), + SIT_I(si->sbi)->mounted_time); if (test_opt(si->sbi, DISCARD)) seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", si->utilization, si->valid_count, si->discard_blks); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 27d0dd7a16d6..44bfc464df78 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -471,7 +471,6 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, struct page *dpage) { struct page *page; - int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir)); int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { @@ -498,8 +497,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, if (err) goto put_error; - if ((IS_ENCRYPTED(dir) || dummy_encrypt) && - f2fs_may_encrypt(inode)) { + if (IS_ENCRYPTED(inode)) { err = fscrypt_inherit_context(dir, inode, page, false); if (err) goto put_error; @@ -850,12 +848,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, 0); set_page_dirty(page); - dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir, false); - - if (inode) - f2fs_drop_nlink(dir, inode); - if (bit_pos == NR_DENTRY_IN_BLOCK && !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_page_cache_dirty_tag(page); @@ -867,6 +859,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_remove_dirty_inode(dir); } f2fs_put_page(page, 1); + + dir->i_ctime = dir->i_mtime = current_time(dir); + f2fs_mark_inode_dirty_sync(dir, false); + + if (inode) + f2fs_drop_nlink(dir, inode); } bool f2fs_empty_dir(struct inode *dir) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 088c3e7a1080..ba470d5687fe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -75,7 +75,6 @@ extern const char *f2fs_fault_name[FAULT_MAX]; /* * For mount options */ -#define F2FS_MOUNT_BG_GC 0x00000001 #define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002 #define F2FS_MOUNT_DISCARD 0x00000004 #define F2FS_MOUNT_NOHEAP 0x00000008 @@ -89,11 +88,8 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 -#define F2FS_MOUNT_FORCE_FG_GC 0x00004000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 -#define F2FS_MOUNT_ADAPTIVE 0x00020000 -#define F2FS_MOUNT_LFS 0x00040000 #define F2FS_MOUNT_USRQUOTA 0x00080000 #define F2FS_MOUNT_GRPQUOTA 0x00100000 #define F2FS_MOUNT_PRJQUOTA 0x00200000 @@ -101,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000 #define F2FS_MOUNT_RESERVE_ROOT 0x01000000 #define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000 +#define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -139,6 +136,8 @@ struct f2fs_mount_info { int whint_mode; int alloc_mode; /* segment allocation policy */ int fsync_mode; /* fsync policy */ + int fs_mode; /* fs mode: LFS or ADAPTIVE */ + int bggc_mode; /* bggc mode: off, on or sync */ bool test_dummy_encryption; /* test dummy encryption */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint @@ -332,8 +331,8 @@ struct discard_policy { bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ + bool timeout; /* discard timeout for put_super */ unsigned int granularity; /* discard granularity */ - int timeout; /* discard timeout for put_super */ }; struct discard_cmd_control { @@ -428,6 +427,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32) #define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15) #define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64) +#define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64) #define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL #define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL @@ -560,6 +560,9 @@ enum { #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ +/* congestion wait timeout value, default: 20ms */ +#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20)) + /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 @@ -676,6 +679,44 @@ enum { MAX_GC_FAILURE }; +/* used for f2fs_inode_info->flags */ +enum { + FI_NEW_INODE, /* indicate newly allocated inode */ + FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ + FI_DIRTY_DIR, /* indicate directory has dirty pages */ + FI_INC_LINK, /* need to increment i_nlink */ + FI_ACL_MODE, /* indicate acl mode */ + FI_NO_ALLOC, /* should not allocate any blocks */ + FI_FREE_NID, /* free allocated nide */ + FI_NO_EXTENT, /* not to use the extent cache */ + FI_INLINE_XATTR, /* used for inline xattr */ + FI_INLINE_DATA, /* used for inline data*/ + FI_INLINE_DENTRY, /* used for inline dentry */ + FI_APPEND_WRITE, /* inode has appended data */ + FI_UPDATE_WRITE, /* inode has in-place-update data */ + FI_NEED_IPU, /* used for ipu per file */ + FI_ATOMIC_FILE, /* indicate atomic file */ + FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ + FI_VOLATILE_FILE, /* indicate volatile file */ + FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ + FI_DROP_CACHE, /* drop dirty page cache */ + FI_DATA_EXIST, /* indicate data exists */ + FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ + FI_PIN_FILE, /* indicate file should not be gced */ + FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ + FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ + FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ + FI_MMAP_FILE, /* indicate file was mmapped */ + FI_MAX, /* max flag, never be used */ +}; + struct f2fs_inode_info { struct inode vfs_inode; /* serve a vfs inode */ unsigned long i_flags; /* keep an inode flags for ioctl */ @@ -688,7 +729,7 @@ struct f2fs_inode_info { umode_t i_acl_mode; /* keep file acl mode temporarily */ /* Use below internally in f2fs*/ - unsigned long flags; /* use to pass per-file flags */ + unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ @@ -697,6 +738,7 @@ struct f2fs_inode_info { struct task_struct *cp_task; /* separate cp/wb IO stats*/ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ + spinlock_t i_size_lock; /* protect last_disk_size */ #ifdef CONFIG_QUOTA struct dquot *i_dquot[MAXQUOTAS]; @@ -1173,6 +1215,20 @@ enum { }; enum { + BGGC_MODE_ON, /* background gc is on */ + BGGC_MODE_OFF, /* background gc is off */ + BGGC_MODE_SYNC, /* + * background gc is on, migrating blocks + * like foreground gc + */ +}; + +enum { + FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ + FS_MODE_LFS, /* use lfs allocation only */ +}; + +enum { WHINT_MODE_OFF, /* not pass down write hints */ WHINT_MODE_USER, /* try to pass down hints given by users */ WHINT_MODE_FS, /* pass down hints with F2FS policy */ @@ -1212,13 +1268,13 @@ enum fsync_mode { enum compress_algorithm_type { COMPRESS_LZO, COMPRESS_LZ4, + COMPRESS_ZSTD, COMPRESS_MAX, }; -#define COMPRESS_DATA_RESERVED_SIZE 4 +#define COMPRESS_DATA_RESERVED_SIZE 5 struct compress_data { __le32 clen; /* compressed data size */ - __le32 chksum; /* checksum of compressed data */ __le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */ u8 cdata[]; /* compressed data */ }; @@ -1242,6 +1298,7 @@ struct compress_ctx { size_t rlen; /* valid data length in rbuf */ size_t clen; /* valid data length in cbuf */ void *private; /* payload buffer for specified compression algorithm */ + void *private2; /* extra payload buffer */ }; /* compress context for write IO path */ @@ -1271,11 +1328,14 @@ struct decompress_io_ctx { size_t clen; /* valid data length in cbuf */ refcount_t ref; /* referrence count of compressed page */ bool failed; /* indicate IO error during decompression */ + void *private; /* payload buffer for specified decompression algorithm */ + void *private2; /* extra payload buffer */ }; #define NULL_CLUSTER ((unsigned int)(~0)) #define MIN_COMPRESS_LOG_SIZE 2 #define MAX_COMPRESS_LOG_SIZE 8 +#define MAX_COMPRESS_WINDOW_SIZE ((PAGE_SIZE) << MAX_COMPRESS_LOG_SIZE) struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ @@ -1471,6 +1531,9 @@ struct f2fs_sb_info { __u32 s_chksum_seed; struct workqueue_struct *post_read_wq; /* post read workqueue */ + + struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ + unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ }; struct f2fs_private_dio { @@ -2211,7 +2274,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, dquot_free_inode(inode); } else { if (unlikely(inode->i_blocks == 0)) { - f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu", + f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu", inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -2379,7 +2442,7 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node) } static inline int f2fs_has_extra_attr(struct inode *inode); -static inline block_t datablock_addr(struct inode *inode, +static inline block_t data_blkaddr(struct inode *inode, struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; @@ -2389,9 +2452,9 @@ static inline block_t datablock_addr(struct inode *inode, raw_node = F2FS_NODE(node_page); - /* from GC path only */ if (is_inode) { if (!inode) + /* from GC path only */ base = offset_in_addr(&raw_node->i); else if (f2fs_has_extra_attr(inode)) base = get_extra_isize(inode); @@ -2401,6 +2464,11 @@ static inline block_t datablock_addr(struct inode *inode, return le32_to_cpu(addr_array[base + offset]); } +static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn) +{ + return data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node); +} + static inline int f2fs_test_bit(unsigned int nr, char *addr) { int mask; @@ -2498,43 +2566,6 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) return flags & F2FS_OTHER_FLMASK; } -/* used for f2fs_inode_info->flags */ -enum { - FI_NEW_INODE, /* indicate newly allocated inode */ - FI_DIRTY_INODE, /* indicate inode is dirty or not */ - FI_AUTO_RECOVER, /* indicate inode is recoverable */ - FI_DIRTY_DIR, /* indicate directory has dirty pages */ - FI_INC_LINK, /* need to increment i_nlink */ - FI_ACL_MODE, /* indicate acl mode */ - FI_NO_ALLOC, /* should not allocate any blocks */ - FI_FREE_NID, /* free allocated nide */ - FI_NO_EXTENT, /* not to use the extent cache */ - FI_INLINE_XATTR, /* used for inline xattr */ - FI_INLINE_DATA, /* used for inline data*/ - FI_INLINE_DENTRY, /* used for inline dentry */ - FI_APPEND_WRITE, /* inode has appended data */ - FI_UPDATE_WRITE, /* inode has in-place-update data */ - FI_NEED_IPU, /* used for ipu per file */ - FI_ATOMIC_FILE, /* indicate atomic file */ - FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ - FI_VOLATILE_FILE, /* indicate volatile file */ - FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ - FI_DROP_CACHE, /* drop dirty page cache */ - FI_DATA_EXIST, /* indicate data exists */ - FI_INLINE_DOTS, /* indicate inline dot dentries */ - FI_DO_DEFRAG, /* indicate defragment is running */ - FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ - FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ - FI_HOT_DATA, /* indicate file is hot */ - FI_EXTRA_ATTR, /* indicate file has extra attribute */ - FI_PROJ_INHERIT, /* indicate file inherits projectid */ - FI_PIN_FILE, /* indicate file should not be gced */ - FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */ - FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ - FI_COMPRESSED_FILE, /* indicate file's data can be compressed */ - FI_MMAP_FILE, /* indicate file was mmapped */ -}; - static inline void __mark_inode_dirty_flag(struct inode *inode, int flag, bool set) { @@ -2549,27 +2580,24 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, case FI_DATA_EXIST: case FI_INLINE_DOTS: case FI_PIN_FILE: - case FI_COMPRESSED_FILE: f2fs_mark_inode_dirty_sync(inode, true); } } static inline void set_inode_flag(struct inode *inode, int flag) { - if (!test_bit(flag, &F2FS_I(inode)->flags)) - set_bit(flag, &F2FS_I(inode)->flags); + test_and_set_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, true); } static inline int is_inode_flag_set(struct inode *inode, int flag) { - return test_bit(flag, &F2FS_I(inode)->flags); + return test_bit(flag, F2FS_I(inode)->flags); } static inline void clear_inode_flag(struct inode *inode, int flag) { - if (test_bit(flag, &F2FS_I(inode)->flags)) - clear_bit(flag, &F2FS_I(inode)->flags); + test_and_clear_bit(flag, F2FS_I(inode)->flags); __mark_inode_dirty_flag(inode, flag, false); } @@ -2660,19 +2688,19 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) struct f2fs_inode_info *fi = F2FS_I(inode); if (ri->i_inline & F2FS_INLINE_XATTR) - set_bit(FI_INLINE_XATTR, &fi->flags); + set_bit(FI_INLINE_XATTR, fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) - set_bit(FI_INLINE_DATA, &fi->flags); + set_bit(FI_INLINE_DATA, fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) - set_bit(FI_INLINE_DENTRY, &fi->flags); + set_bit(FI_INLINE_DENTRY, fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) - set_bit(FI_DATA_EXIST, &fi->flags); + set_bit(FI_DATA_EXIST, fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) - set_bit(FI_INLINE_DOTS, &fi->flags); + set_bit(FI_INLINE_DOTS, fi->flags); if (ri->i_inline & F2FS_EXTRA_ATTR) - set_bit(FI_EXTRA_ATTR, &fi->flags); + set_bit(FI_EXTRA_ATTR, fi->flags); if (ri->i_inline & F2FS_PIN_FILE) - set_bit(FI_PIN_FILE, &fi->flags); + set_bit(FI_PIN_FILE, fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -2857,9 +2885,9 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) if (!f2fs_is_time_consistent(inode)) return false; - down_read(&F2FS_I(inode)->i_sem); + spin_lock(&F2FS_I(inode)->i_size_lock); ret = F2FS_I(inode)->last_disk_size == i_size_read(inode); - up_read(&F2FS_I(inode)->i_sem); + spin_unlock(&F2FS_I(inode)->i_size_lock); return ret; } @@ -3213,7 +3241,7 @@ void f2fs_drop_inmem_pages(struct inode *inode); void f2fs_drop_inmem_page(struct inode *inode, struct page *page); int f2fs_commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); -void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg); int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi); int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); @@ -3309,7 +3337,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_page(struct inode *inode, struct page *page); void f2fs_remove_dirty_inode(struct inode *inode); int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); -void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi); +void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); @@ -3320,7 +3348,7 @@ void f2fs_destroy_checkpoint_caches(void); */ int __init f2fs_init_bioset(void); void f2fs_destroy_bioset(void); -struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail); +struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio); int f2fs_init_bio_entry_cache(void); void f2fs_destroy_bio_entry_cache(void); void f2fs_submit_bio(struct f2fs_sb_info *sbi, @@ -3776,7 +3804,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, - bool is_readahead); + bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_free_dic(struct decompress_io_ctx *dic); void f2fs_decompress_end_io(struct page **rpages, @@ -3813,6 +3841,7 @@ static inline void set_compress_context(struct inode *inode) F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline u64 f2fs_disable_compressed_file(struct inode *inode) @@ -3821,12 +3850,17 @@ static inline u64 f2fs_disable_compressed_file(struct inode *inode) if (!f2fs_compressed_file(inode)) return 0; - if (fi->i_compr_blocks) - return fi->i_compr_blocks; + if (S_ISREG(inode->i_mode)) { + if (get_dirty_pages(inode)) + return 1; + if (fi->i_compr_blocks) + return fi->i_compr_blocks; + } fi->i_flags &= ~F2FS_COMPR_FL; - clear_inode_flag(inode, FI_COMPRESSED_FILE); stat_dec_compr_inode(inode); + clear_inode_flag(inode, FI_COMPRESSED_FILE); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -3903,31 +3937,25 @@ static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi) return false; } - -static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) +static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) { - clear_opt(sbi, ADAPTIVE); - clear_opt(sbi, LFS); - - switch (mt) { - case F2FS_MOUNT_ADAPTIVE: - set_opt(sbi, ADAPTIVE); - break; - case F2FS_MOUNT_LFS: - set_opt(sbi, LFS); - break; - } + return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } -static inline bool f2fs_may_encrypt(struct inode *inode) +static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode) { #ifdef CONFIG_FS_ENCRYPTION + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); umode_t mode = inode->i_mode; - return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); -#else - return false; + /* + * If the directory encrypted or dummy encryption enabled, + * then we should encrypt the inode. + */ + if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) + return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #endif + return false; } static inline bool f2fs_may_compress(struct inode *inode) @@ -3971,7 +3999,7 @@ static inline int allow_outplace_dio(struct inode *inode, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int rw = iov_iter_rw(iter); - return (test_opt(sbi, LFS) && (rw == WRITE) && + return (f2fs_lfs_mode(sbi) && (rw == WRITE) && !block_unaligned_IO(inode, iocb, iter)); } @@ -3993,7 +4021,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, */ if (f2fs_sb_has_blkzoned(sbi)) return true; - if (test_opt(sbi, LFS) && (rw == WRITE)) { + if (f2fs_lfs_mode(sbi) && (rw == WRITE)) { if (block_unaligned_IO(inode, iocb, iter)) return true; if (F2FS_IO_ALIGNED(sbi)) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 351762f77840..6ab8f621a3c5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -106,13 +106,20 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) err = f2fs_get_block(&dn, page->index); f2fs_put_dnode(&dn); __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); - if (err) { - unlock_page(page); - goto out_sem; - } } - /* fill the page */ +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!need_alloc) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + f2fs_put_dnode(&dn); + } +#endif + if (err) { + unlock_page(page); + goto out_sem; + } + f2fs_wait_on_page_writeback(page, DATA, false, true); /* wait for GCed page writeback via META_MAPPING */ @@ -448,8 +455,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.inode, - dn.node_page, dn.ofs_in_node); + blkaddr = f2fs_data_blkaddr(&dn); if (__is_valid_data_blkaddr(blkaddr) && !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), @@ -793,6 +799,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, } flags = fi->i_flags; + if (flags & F2FS_COMPR_FL) + stat->attributes |= STATX_ATTR_COMPRESSED; if (flags & F2FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (IS_ENCRYPTED(inode)) @@ -804,7 +812,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, if (IS_VERITY(inode)) stat->attributes |= STATX_ATTR_VERITY; - stat->attributes_mask |= (STATX_ATTR_APPEND | + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | + STATX_ATTR_APPEND | STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP | @@ -929,10 +938,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - down_write(&F2FS_I(inode)->i_sem); + spin_lock(&F2FS_I(inode)->i_size_lock); inode->i_mtime = inode->i_ctime = current_time(inode); F2FS_I(inode)->last_disk_size = i_size_read(inode); - up_write(&F2FS_I(inode)->i_sem); + spin_unlock(&F2FS_I(inode)->i_size_lock); } __setattr_copy(inode, attr); @@ -1109,8 +1118,7 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.inode, - dn.node_page, dn.ofs_in_node); + *blkaddr = f2fs_data_blkaddr(&dn); if (__is_valid_data_blkaddr(*blkaddr) && !f2fs_is_valid_blkaddr(sbi, *blkaddr, @@ -1121,7 +1129,7 @@ next_dnode: if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) { - if (test_opt(sbi, LFS)) { + if (f2fs_lfs_mode(sbi)) { f2fs_put_dnode(&dn); return -EOPNOTSUPP; } @@ -1199,8 +1207,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.inode, - dn.node_page, dn.ofs_in_node); + dn.data_blkaddr = f2fs_data_blkaddr(&dn); f2fs_truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { @@ -1376,8 +1383,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->inode, dn->node_page, - dn->ofs_in_node) == NULL_ADDR) + if (f2fs_data_blkaddr(dn) == NULL_ADDR) count++; } @@ -1388,8 +1394,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = f2fs_data_blkaddr(dn); /* * f2fs_reserve_new_blocks will not guarantee entire block * allocation. @@ -1787,12 +1792,15 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id) static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) { struct f2fs_inode_info *fi = F2FS_I(inode); + u32 masked_flags = fi->i_flags & mask; + + f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask)); /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) return -EPERM; - if ((iflags ^ fi->i_flags) & F2FS_CASEFOLD_FL) { + if ((iflags ^ masked_flags) & F2FS_CASEFOLD_FL) { if (!f2fs_sb_has_casefold(F2FS_I_SB(inode))) return -EOPNOTSUPP; if (!f2fs_empty_dir(inode)) @@ -1806,27 +1814,22 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) return -EINVAL; } - if ((iflags ^ fi->i_flags) & F2FS_COMPR_FL) { - if (S_ISREG(inode->i_mode) && - (fi->i_flags & F2FS_COMPR_FL || i_size_read(inode) || - F2FS_HAS_BLOCKS(inode))) - return -EINVAL; + if ((iflags ^ masked_flags) & F2FS_COMPR_FL) { + if (masked_flags & F2FS_COMPR_FL) { + if (f2fs_disable_compressed_file(inode)) + return -EINVAL; + } if (iflags & F2FS_NOCOMP_FL) return -EINVAL; if (iflags & F2FS_COMPR_FL) { - int err = f2fs_convert_inline_inode(inode); - - if (err) - return err; - if (!f2fs_may_compress(inode)) return -EINVAL; set_compress_context(inode); } } - if ((iflags ^ fi->i_flags) & F2FS_NOCOMP_FL) { - if (fi->i_flags & F2FS_COMPR_FL) + if ((iflags ^ masked_flags) & F2FS_NOCOMP_FL) { + if (masked_flags & F2FS_COMPR_FL) return -EINVAL; } @@ -3401,6 +3404,21 @@ out: return err; } +static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u64 blocks; + + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return -EOPNOTSUPP; + + if (!f2fs_compressed_file(inode)) + return -EINVAL; + + blocks = F2FS_I(inode)->i_compr_blocks; + return put_user(blocks, (u64 __user *)arg); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp))))) @@ -3481,6 +3499,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_get_volume_name(filp, arg); case F2FS_IOC_SET_VOLUME_NAME: return f2fs_set_volume_name(filp, arg); + case F2FS_IOC_GET_COMPRESS_BLOCKS: + return f2fs_get_compress_blocks(filp, arg); default: return -ENOTTY; } @@ -3508,8 +3528,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } - if (!f2fs_is_compress_backend_ready(inode)) - return -EOPNOTSUPP; + if (!f2fs_is_compress_backend_ready(inode)) { + ret = -EOPNOTSUPP; + goto out; + } if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) { @@ -3639,6 +3661,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC_MEASURE_VERITY: case F2FS_IOC_GET_VOLUME_NAME: case F2FS_IOC_SET_VOLUME_NAME: + case F2FS_IOC_GET_COMPRESS_BLOCKS: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index db8725d473b5..26248c8936db 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -31,6 +31,8 @@ static int gc_thread_func(void *data) set_freezable(); do { + bool sync_mode; + wait_event_interruptible_timeout(*wq, kthread_should_stop() || freezing(current) || gc_th->gc_wake, @@ -101,15 +103,17 @@ static int gc_thread_func(void *data) do_gc: stat_inc_bggc_count(sbi->stat_info); + sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) + if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, prefree_segments(sbi), free_segments(sbi)); /* balancing f2fs's metadata periodically */ - f2fs_balance_fs_bg(sbi); + f2fs_balance_fs_bg(sbi, true); next: sb_end_write(sbi->sb); @@ -192,7 +196,10 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - /* we need to check every dirty segments in the FG_GC case */ + /* + * adjust candidates range, should select all dirty segments for + * foreground GC and urgent GC cases. + */ if (gc_type != FG_GC && (sbi->gc_mode != GC_URGENT) && p->max_search > sbi->max_victim_search) @@ -634,7 +641,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); + source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) { @@ -762,7 +769,7 @@ static int move_data_block(struct inode *inode, block_t bidx, struct page *page, *mpage; block_t newaddr; int err = 0; - bool lfs_mode = test_opt(fio.sbi, LFS); + bool lfs_mode = f2fs_lfs_mode(fio.sbi); /* do not read out */ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false); @@ -970,7 +977,8 @@ retry: if (err) { clear_cold_data(page); if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); goto retry; } if (is_dirty) @@ -1018,8 +1026,8 @@ next_step: * race condition along with SSR block allocation. */ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || - get_valid_blocks(sbi, segno, false) == - sbi->blocks_per_seg) + get_valid_blocks(sbi, segno, true) == + BLKS_PER_SEC(sbi)) return submitted; if (check_valid_map(sbi, segno, off) == 0) @@ -1203,7 +1211,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, if (get_valid_blocks(sbi, segno, false) == 0) goto freed; - if (__is_large_section(sbi) && + if (gc_type == BG_GC && __is_large_section(sbi) && migrated >= sbi->migration_granularity) goto skip; if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) @@ -1233,12 +1241,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, segno, gc_type); stat_inc_seg_count(sbi, type, gc_type); + migrated++; freed: if (gc_type == FG_GC && get_valid_blocks(sbi, segno, false) == 0) seg_freed++; - migrated++; if (__is_large_section(sbi) && segno + 1 < end_segno) sbi->next_victim_seg[gc_type] = segno + 1; @@ -1434,12 +1442,19 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start, static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) { struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi); - int section_count = le32_to_cpu(raw_sb->section_count); - int segment_count = le32_to_cpu(raw_sb->segment_count); - int segment_count_main = le32_to_cpu(raw_sb->segment_count_main); - long long block_count = le64_to_cpu(raw_sb->block_count); + int section_count; + int segment_count; + int segment_count_main; + long long block_count; int segs = secs * sbi->segs_per_sec; + down_write(&sbi->sb_lock); + + section_count = le32_to_cpu(raw_sb->section_count); + segment_count = le32_to_cpu(raw_sb->segment_count); + segment_count_main = le32_to_cpu(raw_sb->segment_count_main); + block_count = le64_to_cpu(raw_sb->block_count); + raw_sb->section_count = cpu_to_le32(section_count + secs); raw_sb->segment_count = cpu_to_le32(segment_count + segs); raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs); @@ -1453,6 +1468,8 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs) raw_sb->devs[last_dev].total_segments = cpu_to_le32(dev_segs + segs); } + + up_write(&sbi->sb_lock); } static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) @@ -1570,11 +1587,17 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) goto out; } + mutex_lock(&sbi->cp_mutex); update_fs_metadata(sbi, -secs); clear_sbi_flag(sbi, SBI_IS_RESIZEFS); + set_sbi_flag(sbi, SBI_IS_DIRTY); + mutex_unlock(&sbi->cp_mutex); + err = f2fs_sync_fs(sbi->sb, 1); if (err) { + mutex_lock(&sbi->cp_mutex); update_fs_metadata(sbi, secs); + mutex_unlock(&sbi->cp_mutex); update_sb_metadata(sbi, secs); f2fs_commit_super(sbi, false); } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 78c3f1d70f1d..44582a4db513 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -291,13 +291,30 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) fi->i_flags & F2FS_COMPR_FL && F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_log_cluster_size)) { - if (ri->i_compress_algorithm >= COMPRESS_MAX) + if (ri->i_compress_algorithm >= COMPRESS_MAX) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " + "compress algorithm: %u, run fsck to fix", + __func__, inode->i_ino, + ri->i_compress_algorithm); return false; - if (le64_to_cpu(ri->i_compr_blocks) > inode->i_blocks) + } + if (le64_to_cpu(ri->i_compr_blocks) > + SECTOR_TO_BLOCK(inode->i_blocks)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent " + "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", + __func__, inode->i_ino, + le64_to_cpu(ri->i_compr_blocks), + SECTOR_TO_BLOCK(inode->i_blocks)); return false; + } if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || - ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) + ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported " + "log cluster size: %u, run fsck to fix", + __func__, inode->i_ino, + ri->i_log_cluster_size); return false; + } } return true; @@ -345,7 +362,7 @@ static int do_read_inode(struct inode *inode) fi->i_flags = le32_to_cpu(ri->i_flags); if (S_ISREG(inode->i_mode)) fi->i_flags &= ~F2FS_PROJINHERIT_FL; - fi->flags = 0; + bitmap_zero(fi->flags, FI_MAX); fi->i_advise = ri->i_advise; fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; @@ -518,7 +535,7 @@ retry: inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); goto retry; } } @@ -759,7 +776,7 @@ no_delete: else f2fs_inode_synced(inode); - /* ino == 0, if f2fs_new_inode() was failed t*/ + /* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */ if (inode->i_ino) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2aa035422c0f..f54119da2217 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -75,9 +75,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); - /* If the directory encrypted, then we should encrypt the inode. */ - if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) && - f2fs_may_encrypt(inode)) + if (f2fs_may_encrypt(dir, inode)) f2fs_set_encrypted_inode(inode); if (f2fs_sb_has_extra_attr(sbi)) { @@ -177,7 +175,7 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub) } /* - * Set multimedia files as cold files for hot/cold data separation + * Set file's temperature for hot/cold data separation */ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode, const unsigned char *name) @@ -876,12 +874,6 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) { - int err = fscrypt_get_encryption_info(dir); - if (err) - return err; - } - return __f2fs_tmpfile(dir, dentry, mode, NULL); } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9d02cdcdbb07..ecbd6bd14a49 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -510,9 +510,6 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) return nr - nr_shrink; } -/* - * This function always returns success - */ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) { @@ -716,8 +713,7 @@ got: /* * Caller should call f2fs_put_dnode(dn). * Also, it should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op() only if ro is not set RDONLY_NODE. - * In the case of RDONLY_NODE, we don't need to care about mutex. + * f2fs_unlock_op() only if mode is set with ALLOC_NODE. */ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { @@ -809,8 +805,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->inode, - dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = f2fs_data_blkaddr(dn); return 0; release_pages: @@ -1188,8 +1183,9 @@ int f2fs_remove_inode_page(struct inode *inode) } if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) { - f2fs_warn(F2FS_I_SB(inode), "Inconsistent i_blocks, ino:%lu, iblocks:%llu", - inode->i_ino, (unsigned long long)inode->i_blocks); + f2fs_warn(F2FS_I_SB(inode), + "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu", + inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); } @@ -1562,15 +1558,16 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (atomic && !test_opt(sbi, NOBARRIER)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; - set_page_writeback(page); - ClearPageError(page); - + /* should add to global list before clearing PAGECACHE status */ if (f2fs_in_warm_node_list(sbi, page)) { seq = f2fs_add_fsync_node_entry(sbi, page); if (seq_id) *seq_id = seq; } + set_page_writeback(page); + ClearPageError(page); + fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); @@ -1979,7 +1976,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, goto skip_write; /* balancing f2fs's metadata in background */ - f2fs_balance_fs_bg(sbi); + f2fs_balance_fs_bg(sbi, true); /* collect a number of dirty node pages and write together */ if (wbc->sync_mode != WB_SYNC_ALL && @@ -2602,7 +2599,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) retry: ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); if (!ipage) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); goto retry; } @@ -3193,22 +3190,22 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) int __init f2fs_create_node_manager_caches(void) { - nat_entry_slab = f2fs_kmem_cache_create("nat_entry", + nat_entry_slab = f2fs_kmem_cache_create("f2fs_nat_entry", sizeof(struct nat_entry)); if (!nat_entry_slab) goto fail; - free_nid_slab = f2fs_kmem_cache_create("free_nid", + free_nid_slab = f2fs_kmem_cache_create("f2fs_free_nid", sizeof(struct free_nid)); if (!free_nid_slab) goto destroy_nat_entry; - nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set", + nat_entry_set_slab = f2fs_kmem_cache_create("f2fs_nat_entry_set", sizeof(struct nat_entry_set)); if (!nat_entry_set_slab) goto destroy_free_nid; - fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry", + fsync_node_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_node_entry", sizeof(struct fsync_node_entry)); if (!fsync_node_entry_slab) goto destroy_nat_entry_set; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 763d5c0951d1..dd804c07eeb0 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -496,8 +496,7 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.inode, tdn.node_page, - tdn.ofs_in_node) == blkaddr) + if (f2fs_data_blkaddr(&tdn) == blkaddr) f2fs_truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -535,7 +534,7 @@ retry_dn: err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); goto retry_dn; } goto out; @@ -560,8 +559,8 @@ retry_dn: for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - dest = datablock_addr(dn.inode, page, dn.ofs_in_node); + src = f2fs_data_blkaddr(&dn); + dest = data_blkaddr(dn.inode, page, dn.ofs_in_node); if (__is_valid_data_blkaddr(src) && !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { @@ -618,7 +617,8 @@ retry_prev: err = check_index_in_prev_nodes(sbi, dest, &dn); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); goto retry_prev; } goto err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cf0eb002cfd4..b7a9421472a7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -172,7 +172,7 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi) int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); - if (test_opt(sbi, LFS)) + if (f2fs_lfs_mode(sbi)) return false; if (sbi->gc_mode == GC_URGENT) return true; @@ -245,7 +245,8 @@ retry: LOOKUP_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); cond_resched(); goto retry; } @@ -312,7 +313,7 @@ next: skip: iput(inode); } - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); cond_resched(); if (gc_failure) { if (++looped >= count) @@ -415,7 +416,8 @@ retry: err = f2fs_do_write_data_page(&fio); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); cond_resched(); goto retry; } @@ -494,7 +496,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) - f2fs_balance_fs_bg(sbi); + f2fs_balance_fs_bg(sbi, false); if (!f2fs_is_checkpoint_ready(sbi)) return; @@ -509,7 +511,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } } -void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) { if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return; @@ -538,7 +540,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) || f2fs_time_over(sbi, CP_TIME)) { - if (test_opt(sbi, DATA_FLUSH)) { + if (test_opt(sbi, DATA_FLUSH) && from_bg) { struct blk_plug plug; mutex_lock(&sbi->flush_lock); @@ -1078,7 +1080,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; dpolicy->io_aware_gran = MAX_PLIST_NUM; - dpolicy->timeout = 0; + dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1103,6 +1105,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ dpolicy->granularity = 1; + dpolicy->timeout = true; } } @@ -1471,12 +1474,12 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, int i, issued = 0; bool io_interrupted = false; - if (dpolicy->timeout != 0) - f2fs_update_time(sbi, dpolicy->timeout); + if (dpolicy->timeout) + f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { - if (dpolicy->timeout != 0 && - f2fs_time_over(sbi, dpolicy->timeout)) + if (dpolicy->timeout && + f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (i + 1 < dpolicy->granularity) @@ -1497,8 +1500,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - if (dpolicy->timeout != 0 && - f2fs_time_over(sbi, dpolicy->timeout)) + if (dpolicy->timeout && + f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (dpolicy->io_aware && i < dpolicy->io_aware_gran && @@ -1677,7 +1680,6 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); - dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT; __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); @@ -1940,7 +1942,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); - bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi); + bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); mutex_lock(&dirty_i->seglist_lock); @@ -1972,7 +1974,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, (end - 1) <= cpc->trim_end) continue; - if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) { + if (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi)) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); continue; @@ -2801,7 +2803,7 @@ next: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); goto next; } skip: @@ -2830,7 +2832,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; - bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi); + bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -3193,7 +3195,7 @@ static void update_device_state(struct f2fs_io_info *fio) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio); - bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA); + bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA); if (keep_order) down_read(&fio->sbi->io_order_lock); @@ -4071,7 +4073,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); - sit_i->mounted_time = ktime_get_real_seconds(); + sit_i->mounted_time = ktime_get_boottime_seconds(); init_rwsem(&sit_i->sentry_lock); return 0; } @@ -4678,7 +4680,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; - if (!test_opt(sbi, LFS)) + if (!f2fs_lfs_mode(sbi)) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; @@ -4830,22 +4832,22 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) int __init f2fs_create_segment_manager_caches(void) { - discard_entry_slab = f2fs_kmem_cache_create("discard_entry", + discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry", sizeof(struct discard_entry)); if (!discard_entry_slab) goto fail; - discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd", + discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd", sizeof(struct discard_cmd)); if (!discard_cmd_slab) goto destroy_discard_entry; - sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", + sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) goto destroy_discard_cmd; - inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", + inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry", sizeof(struct inmem_pages)); if (!inmem_entry_slab) goto destroy_sit_entry_set; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 459dc3901a57..7a83bd530812 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -756,7 +756,7 @@ static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi, bool base_time) { struct sit_info *sit_i = SIT_I(sbi); - time64_t diff, now = ktime_get_real_seconds(); + time64_t diff, now = ktime_get_boottime_seconds(); if (now >= sit_i->mounted_time) return sit_i->elapsed_time + now - sit_i->mounted_time; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index a467aca29cfe..d66de5999a26 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -58,7 +58,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, /* count extent cache entries */ count += __count_extent_cache(sbi); - /* shrink clean nat cache entries */ + /* count clean nat cache entries */ count += __count_nat_entries(sbi); /* count free nids cache entries */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d398b2d90c6c..f2dfc21c6abb 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -428,14 +428,11 @@ static int parse_options(struct super_block *sb, char *options) if (!name) return -ENOMEM; if (strlen(name) == 2 && !strncmp(name, "on", 2)) { - set_opt(sbi, BG_GC); - clear_opt(sbi, FORCE_FG_GC); + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) { - clear_opt(sbi, BG_GC); - clear_opt(sbi, FORCE_FG_GC); + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF; } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) { - set_opt(sbi, BG_GC); - set_opt(sbi, FORCE_FG_GC); + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC; } else { kvfree(name); return -EINVAL; @@ -447,7 +444,7 @@ static int parse_options(struct super_block *sb, char *options) break; case Opt_norecovery: /* this option mounts f2fs with ro */ - set_opt(sbi, DISABLE_ROLL_FORWARD); + set_opt(sbi, NORECOVERY); if (!f2fs_readonly(sb)) return -EINVAL; break; @@ -601,10 +598,10 @@ static int parse_options(struct super_block *sb, char *options) kvfree(name); return -EINVAL; } - set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { - set_opt_mode(sbi, F2FS_MOUNT_LFS); + F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; } else { kvfree(name); return -EINVAL; @@ -833,6 +830,10 @@ static int parse_options(struct super_block *sb, char *options) !strcmp(name, "lz4")) { F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; + } else if (strlen(name) == 4 && + !strcmp(name, "zstd")) { + F2FS_OPTION(sbi).compress_algorithm = + COMPRESS_ZSTD; } else { kfree(name); return -EINVAL; @@ -905,7 +906,7 @@ static int parse_options(struct super_block *sb, char *options) } #endif - if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { + if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", F2FS_IO_SIZE_KB(sbi)); return -EINVAL; @@ -935,7 +936,7 @@ static int parse_options(struct super_block *sb, char *options) } } - if (test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, LFS)) { + if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) { f2fs_err(sbi, "LFS not compatible with checkpoint=disable\n"); return -EINVAL; } @@ -961,6 +962,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ atomic_set(&fi->dirty_pages, 0); init_rwsem(&fi->i_sem); + spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_ilist); @@ -1173,7 +1175,7 @@ static void f2fs_put_super(struct super_block *sb) /* our cp_error case, we can wait for any writeback page */ f2fs_flush_merged_writes(sbi); - f2fs_wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); f2fs_bug_on(sbi, sbi->fsync_node_num); @@ -1205,6 +1207,7 @@ static void f2fs_put_super(struct super_block *sb) kvfree(sbi->raw_super); destroy_device_list(sbi); + f2fs_destroy_xattr_caches(sbi); mempool_destroy(sbi->write_io_dummy); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -1421,6 +1424,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, case COMPRESS_LZ4: algtype = "lz4"; break; + case COMPRESS_ZSTD: + algtype = "zstd"; + break; } seq_printf(seq, ",compress_algorithm=%s", algtype); @@ -1437,16 +1443,17 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); - if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) { - if (test_opt(sbi, FORCE_FG_GC)) - seq_printf(seq, ",background_gc=%s", "sync"); - else - seq_printf(seq, ",background_gc=%s", "on"); - } else { + if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) + seq_printf(seq, ",background_gc=%s", "sync"); + else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_ON) + seq_printf(seq, ",background_gc=%s", "on"); + else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) seq_printf(seq, ",background_gc=%s", "off"); - } + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); + if (test_opt(sbi, NORECOVERY)) + seq_puts(seq, ",norecovery"); if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); else @@ -1498,9 +1505,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",data_flush"); seq_puts(seq, ",mode="); - if (test_opt(sbi, ADAPTIVE)) + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_ADAPTIVE) seq_puts(seq, "adaptive"); - else if (test_opt(sbi, LFS)) + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) @@ -1571,11 +1578,11 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); - F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO; + F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE; F2FS_OPTION(sbi).compress_ext_cnt = 0; + F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; - set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); @@ -1587,9 +1594,9 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, FLUSH_MERGE); set_opt(sbi, DISCARD); if (f2fs_sb_has_blkzoned(sbi)) - set_opt_mode(sbi, F2FS_MOUNT_LFS); + F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; else - set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -1658,7 +1665,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) out_unlock: up_write(&sbi->gc_lock); restore_flag: - sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } @@ -1781,7 +1788,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if background_gc = off is passed in mount * option. Also sync the filesystem. */ - if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) { + if ((*flags & SB_RDONLY) || + F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) { if (sbi->gc_thread) { f2fs_stop_gc_thread(sbi); need_restart_gc = true; @@ -1886,7 +1894,8 @@ repeat: page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); goto repeat; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); @@ -1928,6 +1937,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, int offset = off & (sb->s_blocksize - 1); size_t towrite = len; struct page *page; + void *fsdata = NULL; char *kaddr; int err = 0; int tocopy; @@ -1937,10 +1947,11 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, towrite); retry: err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, - &page, NULL); + &page, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); @@ -1953,7 +1964,7 @@ retry: flush_dcache_page(page); a_ops->write_end(NULL, mapping, off, tocopy, tocopy, - page, NULL); + page, fsdata); offset = 0; towrite -= tocopy; off += tocopy; @@ -3457,12 +3468,17 @@ try_onemore: } } + /* init per sbi slab cache */ + err = f2fs_init_xattr_caches(sbi); + if (err) + goto free_io_dummy; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_err(sbi, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_io_dummy; + goto free_xattr_cache; } err = f2fs_get_valid_checkpoint(sbi); @@ -3590,7 +3606,7 @@ try_onemore: f2fs_err(sbi, "Cannot turn on quotas: error %d", err); } #endif - /* if there are nt orphan nodes free them */ + /* if there are any orphan inodes, free them */ err = f2fs_recover_orphan_inodes(sbi); if (err) goto free_meta; @@ -3599,7 +3615,8 @@ try_onemore: goto reset_checkpoint; /* recover fsynced data */ - if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + if (!test_opt(sbi, DISABLE_ROLL_FORWARD) && + !test_opt(sbi, NORECOVERY)) { /* * mount should be failed, when device has readonly mode, and * previous checkpoint was not done by clean system shutdown. @@ -3665,7 +3682,7 @@ reset_checkpoint: * If filesystem is not mounted as read-only then * do start the gc_thread. */ - if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) { + if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) @@ -3734,6 +3751,8 @@ free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); sbi->meta_inode = NULL; +free_xattr_cache: + f2fs_destroy_xattr_caches(sbi); free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_percpu: diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 91d649790b1b..e3bbbef9b4f0 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -109,47 +109,47 @@ static ssize_t features_show(struct f2fs_attr *a, return sprintf(buf, "0\n"); if (f2fs_sb_has_encrypt(sbi)) - len += snprintf(buf, PAGE_SIZE - len, "%s", + len += scnprintf(buf, PAGE_SIZE - len, "%s", "encryption"); if (f2fs_sb_has_blkzoned(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "blkzoned"); if (f2fs_sb_has_extra_attr(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "extra_attr"); if (f2fs_sb_has_project_quota(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "projquota"); if (f2fs_sb_has_inode_chksum(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_checksum"); if (f2fs_sb_has_flexible_inline_xattr(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "flexible_inline_xattr"); if (f2fs_sb_has_quota_ino(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "quota_ino"); if (f2fs_sb_has_inode_crtime(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "inode_crtime"); if (f2fs_sb_has_lost_found(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "lost_found"); if (f2fs_sb_has_verity(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "verity"); if (f2fs_sb_has_sb_chksum(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "sb_checksum"); if (f2fs_sb_has_casefold(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "casefold"); if (f2fs_sb_has_compression(sbi)) - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "compression"); - len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s", len ? ", " : "", "pin_file"); - len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + len += scnprintf(buf + len, PAGE_SIZE - len, "\n"); return len; } @@ -185,6 +185,12 @@ static ssize_t encoding_show(struct f2fs_attr *a, return sprintf(buf, "(none)"); } +static ssize_t mounted_time_sec_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time); +} + #ifdef CONFIG_F2FS_STAT_FS static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) @@ -233,16 +239,16 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, int hot_count = sbi->raw_super->hot_ext_count; int len = 0, i; - len += snprintf(buf + len, PAGE_SIZE - len, + len += scnprintf(buf + len, PAGE_SIZE - len, "cold file extension:\n"); for (i = 0; i < cold_count; i++) - len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); - len += snprintf(buf + len, PAGE_SIZE - len, + len += scnprintf(buf + len, PAGE_SIZE - len, "hot file extension:\n"); for (i = cold_count; i < cold_count + hot_count; i++) - len += snprintf(buf + len, PAGE_SIZE - len, "%s\n", + len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n", extlist[i]); return len; } @@ -544,6 +550,7 @@ F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); +F2FS_GENERAL_RO_ATTR(mounted_time_sec); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -573,7 +580,9 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY); #endif F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); +#ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); +#endif #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -621,6 +630,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(reserved_blocks), ATTR_LIST(current_reserved_blocks), ATTR_LIST(encoding), + ATTR_LIST(mounted_time_sec), #ifdef CONFIG_F2FS_STAT_FS ATTR_LIST(cp_foreground_calls), ATTR_LIST(cp_background_calls), @@ -654,7 +664,9 @@ static struct attribute *f2fs_feat_attrs[] = { #endif ATTR_LIST(sb_checksum), ATTR_LIST(casefold), +#ifdef CONFIG_F2FS_FS_COMPRESSION ATTR_LIST(compression), +#endif NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 296b3189448a..4f6582ef7ee3 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -23,6 +23,25 @@ #include "xattr.h" #include "segment.h" +static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline) +{ + if (likely(size == sbi->inline_xattr_slab_size)) { + *is_inline = true; + return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS); + } + *is_inline = false; + return f2fs_kzalloc(sbi, size, GFP_NOFS); +} + +static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr, + bool is_inline) +{ + if (is_inline) + kmem_cache_free(sbi->inline_xattr_slab, xattr_addr); + else + kvfree(xattr_addr); +} + static int f2fs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) @@ -301,7 +320,8 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) static int lookup_all_xattrs(struct inode *inode, struct page *ipage, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, - void **base_addr, int *base_size) + void **base_addr, int *base_size, + bool *is_inline) { void *cur_addr, *txattr_addr, *last_txattr_addr; void *last_addr = NULL; @@ -312,12 +332,12 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, if (!xnid && !inline_size) return -ENODATA; - *base_size = XATTR_SIZE(xnid, inode) + XATTR_PADDING_SIZE; - txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS); + *base_size = XATTR_SIZE(inode) + XATTR_PADDING_SIZE; + txattr_addr = xattr_alloc(F2FS_I_SB(inode), *base_size, is_inline); if (!txattr_addr) return -ENOMEM; - last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(xnid, inode); + last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(inode); /* read from inline xattr */ if (inline_size) { @@ -362,7 +382,7 @@ check: *base_addr = txattr_addr; return 0; out: - kvfree(txattr_addr); + xattr_free(F2FS_I_SB(inode), txattr_addr, *is_inline); return err; } @@ -499,6 +519,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, unsigned int size, len; void *base_addr = NULL; int base_size; + bool is_inline; if (name == NULL) return -EINVAL; @@ -509,7 +530,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, - &entry, &base_addr, &base_size); + &entry, &base_addr, &base_size, &is_inline); up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -532,14 +553,13 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, } error = size; out: - kvfree(base_addr); + xattr_free(F2FS_I_SB(inode), base_addr, is_inline); return error; } ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); - nid_t xnid = F2FS_I(inode)->i_xattr_nid; struct f2fs_xattr_entry *entry; void *base_addr, *last_base_addr; int error = 0; @@ -551,7 +571,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (error) return error; - last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode); + last_base_addr = (void *)base_addr + XATTR_SIZE(inode); list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -609,7 +629,6 @@ static int __f2fs_setxattr(struct inode *inode, int index, { struct f2fs_xattr_entry *here, *last; void *base_addr, *last_base_addr; - nid_t xnid = F2FS_I(inode)->i_xattr_nid; int found, newsize; size_t len; __u32 new_hsize; @@ -633,7 +652,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) return error; - last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode); + last_base_addr = (void *)base_addr + XATTR_SIZE(inode); /* find entry with wanted name. */ here = __find_xattr(base_addr, last_base_addr, index, len, name); @@ -758,14 +777,34 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - /* protect xattr_ver */ - down_write(&F2FS_I(inode)->i_sem); down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); up_write(&F2FS_I(inode)->i_xattr_sem); - up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); f2fs_update_time(sbi, REQ_TIME); return err; } + +int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + char slab_name[32]; + + sprintf(slab_name, "f2fs_xattr_entry-%u:%u", MAJOR(dev), MINOR(dev)); + + sbi->inline_xattr_slab_size = F2FS_OPTION(sbi).inline_xattr_size * + sizeof(__le32) + XATTR_PADDING_SIZE; + + sbi->inline_xattr_slab = f2fs_kmem_cache_create(slab_name, + sbi->inline_xattr_slab_size); + if (!sbi->inline_xattr_slab) + return -ENOMEM; + + return 0; +} + +void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) +{ + kmem_cache_destroy(sbi->inline_xattr_slab); +} diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index de0c600b9cab..938fcd20565d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -49,7 +49,7 @@ struct f2fs_xattr_entry { __u8 e_name_index; __u8 e_name_len; __le16 e_value_size; /* size of attribute value */ - char e_name[0]; /* attribute name */ + char e_name[]; /* attribute name */ }; #define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr)) @@ -73,7 +73,8 @@ struct f2fs_xattr_entry { entry = XATTR_NEXT_ENTRY(entry)) #define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) #define XATTR_PADDING_SIZE (sizeof(__u32)) -#define XATTR_SIZE(x,i) (((x) ? VALID_XATTR_BLOCK_SIZE : 0) + \ +#define XATTR_SIZE(i) ((F2FS_I(i)->i_xattr_nid ? \ + VALID_XATTR_BLOCK_SIZE : 0) + \ (inline_xattr_size(i))) #define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ VALID_XATTR_BLOCK_SIZE) @@ -130,6 +131,8 @@ extern int f2fs_setxattr(struct inode *, int, const char *, extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t, struct page *); extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); +extern int f2fs_init_xattr_caches(struct f2fs_sb_info *); +extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *); #else #define f2fs_xattr_handlers NULL @@ -150,6 +153,8 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, { return -EOPNOTSUPP; } +static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { } #endif #ifdef CONFIG_F2FS_FS_SECURITY diff --git a/fs/filesystems.c b/fs/filesystems.c index 77bf5f95362d..90b8d879fbaf 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -272,7 +272,9 @@ struct file_system_type *get_fs_type(const char *name) fs = __get_fs_type(name, len); if (!fs && (request_module("fs-%.*s", len, name) == 0)) { fs = __get_fs_type(name, len); - WARN_ONCE(!fs, "request_module fs-%.*s succeeded, but still no fs?\n", len, name); + if (!fs) + pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n", + len, name); } if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 7e6fb43f9541..ab53e42a874a 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -368,8 +368,6 @@ bool fs_validate_description(const char *name, const struct fs_parameter_spec *param, *p2; bool good = true; - pr_notice("*** VALIDATE %s ***\n", name); - for (param = desc; param->name; param++) { /* Check for duplicate parameter names */ for (p2 = desc; p2 < param; p2++) { diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index e6d554476db4..eeebe80c6be4 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -292,6 +292,10 @@ static int __hfsplus_delete_attr(struct inode *inode, u32 cnid, return -ENOENT; } + /* Avoid btree corruption */ + hfs_bnode_read(fd->bnode, fd->search_key, + fd->keyoffset, fd->keylength); + err = hfs_brec_remove(fd); if (err) return err; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e6b8c49076bb..c070c0d8e3e9 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -139,8 +139,8 @@ static char *inode_name(struct inode *ino) static char *follow_link(char *link) { - int len, n; char *name, *resolved, *end; + int n; name = __getname(); if (!name) { @@ -164,15 +164,13 @@ static char *follow_link(char *link) return name; *(end + 1) = '\0'; - len = strlen(link) + strlen(name) + 1; - resolved = kmalloc(len, GFP_KERNEL); + resolved = kasprintf(GFP_KERNEL, "%s%s", link, name); if (resolved == NULL) { n = -ENOMEM; goto out_free; } - sprintf(resolved, "%s%s", link, name); __putname(name); kfree(link); return resolved; @@ -921,18 +919,16 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_d_op = &simple_dentry_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; - /* NULL is printed as <NULL> by sprintf: avoid that. */ + /* NULL is printed as '(null)' by printf(): avoid that. */ if (req_root == NULL) req_root = ""; err = -ENOMEM; sb->s_fs_info = host_root_path = - kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL); + kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root); if (host_root_path == NULL) goto out; - sprintf(host_root_path, "%s/%s", root_ino, req_root); - root_inode = new_inode(sb); if (!root_inode) goto out; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index aff8642f0c2e..991c60c7ffe0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -393,10 +393,9 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() prevents page faults in the - * truncated range. It checks i_size before allocation, and again after - * with the page table lock for the page held. The same lock must be - * acquired to unmap a page. + * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents + * page faults in the truncated range by checking i_size. i_size is + * modified while holding i_mmap_rwsem. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserv map @@ -436,7 +435,15 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, index = page->index; hash = hugetlb_fault_mutex_hash(mapping, index); - mutex_lock(&hugetlb_fault_mutex_table[hash]); + if (!truncate_op) { + /* + * Only need to hold the fault mutex in the + * hole punch case. This prevents races with + * page faults. Races are not possible in the + * case of truncation. + */ + mutex_lock(&hugetlb_fault_mutex_table[hash]); + } /* * If page is mapped, it was faulted in after being @@ -450,7 +457,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, if (unlikely(page_mapped(page))) { BUG_ON(truncate_op); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_lock_write(mapping); + mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vmdelete_list(&mapping->i_mmap, index * pages_per_huge_page(h), (index + 1) * pages_per_huge_page(h)); @@ -477,7 +486,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } unlock_page(page); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); + if (!truncate_op) + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } huge_pagevec_release(&pvec); cond_resched(); @@ -515,8 +525,8 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; - i_size_write(inode, offset); i_mmap_lock_write(mapping); + i_size_write(inode, offset); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); @@ -638,7 +648,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* mutex taken here, fault path and hole punch */ + /* + * fault mutex taken here, protects against fault path + * and hole punch. inode_lock previously taken protects + * against truncation. + */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); diff --git a/fs/internal.h b/fs/internal.h index 4d37912a5587..aa5d45524e87 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -60,7 +60,6 @@ extern int finish_clean_context(struct fs_context *fc); */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root); -extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); long do_mknodat(int dfd, const char __user *filename, umode_t mode, diff --git a/fs/io-wq.c b/fs/io-wq.c index cc5cf2209fb0..4023c9846860 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -17,6 +17,7 @@ #include <linux/kthread.h> #include <linux/rculist_nulls.h> #include <linux/fs_struct.h> +#include <linux/task_work.h> #include "io-wq.h" @@ -716,6 +717,9 @@ static int io_wq_manager(void *data) complete(&wq->done); while (!kthread_should_stop()) { + if (current->task_works) + task_work_run(); + for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; bool fork_worker[2] = { false, false }; @@ -738,6 +742,9 @@ static int io_wq_manager(void *data) schedule_timeout(HZ); } + if (current->task_works) + task_work_run(); + return 0; err: set_bit(IO_WQ_BIT_ERROR, &wq->state); @@ -1124,3 +1131,8 @@ void io_wq_destroy(struct io_wq *wq) if (refcount_dec_and_test(&wq->use_refs)) __io_wq_destroy(wq); } + +struct task_struct *io_wq_get_task(struct io_wq *wq) +{ + return wq->manager; +} diff --git a/fs/io-wq.h b/fs/io-wq.h index 3ee7356d6be5..5ba12de7572f 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -136,6 +136,8 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *); enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, void *data); +struct task_struct *io_wq_get_task(struct io_wq *wq); + #if defined(CONFIG_IO_WQ) extern void io_wq_worker_sleeping(struct task_struct *); extern void io_wq_worker_running(struct task_struct *); diff --git a/fs/io_uring.c b/fs/io_uring.c index 358f97be9c7b..5190bfb6a665 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -186,14 +186,23 @@ struct fixed_file_table { struct file **files; }; +struct fixed_file_ref_node { + struct percpu_ref refs; + struct list_head node; + struct list_head file_list; + struct fixed_file_data *file_data; + struct work_struct work; +}; + struct fixed_file_data { struct fixed_file_table *table; struct io_ring_ctx *ctx; + struct percpu_ref *cur_refs; struct percpu_ref refs; - struct llist_head put_llist; - struct work_struct ref_work; struct completion done; + struct list_head ref_list; + spinlock_t lock; }; struct io_buffer { @@ -317,6 +326,8 @@ struct io_ring_ctx { spinlock_t inflight_lock; struct list_head inflight_list; } ____cacheline_aligned_in_smp; + + struct work_struct exit_work; }; /* @@ -599,6 +610,7 @@ struct io_kiocb { }; struct io_async_ctx *io; + int cflags; bool needs_fixed_file; u8 opcode; @@ -606,10 +618,8 @@ struct io_kiocb { struct list_head list; unsigned int flags; refcount_t refs; - union { - struct task_struct *task; - unsigned long fsize; - }; + struct task_struct *task; + unsigned long fsize; u64 user_data; u32 result; u32 sequence; @@ -618,6 +628,8 @@ struct io_kiocb { struct list_head inflight_entry; + struct percpu_ref *fixed_file_refs; + union { /* * Only commands that never go async can use the below fields, @@ -629,7 +641,6 @@ struct io_kiocb { struct callback_head task_work; struct hlist_node hash_node; struct async_poll *apoll; - int cflags; }; struct io_wq_work work; }; @@ -848,7 +859,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_files_update *ip, unsigned nr_args); static int io_grab_files(struct io_kiocb *req); -static void io_ring_file_ref_flush(struct fixed_file_data *data); static void io_cleanup_req(struct io_kiocb *req); static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, int fd, struct file **out_file, bool fixed); @@ -1285,8 +1295,8 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) return NULL; } -static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, - struct io_submit_state *state) +static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx, + struct io_submit_state *state) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct io_kiocb *req; @@ -1319,41 +1329,20 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req = state->reqs[state->free_reqs]; } -got_it: - req->io = NULL; - req->file = NULL; - req->ctx = ctx; - req->flags = 0; - /* one is dropped after submission, the other at completion */ - refcount_set(&req->refs, 2); - req->result = 0; - INIT_IO_WORK(&req->work, io_wq_submit_work); return req; fallback: - req = io_get_fallback_req(ctx); - if (req) - goto got_it; - percpu_ref_put(&ctx->refs); - return NULL; + return io_get_fallback_req(ctx); } static inline void io_put_file(struct io_kiocb *req, struct file *file, bool fixed) { if (fixed) - percpu_ref_put(&req->ctx->file_data->refs); + percpu_ref_put(req->fixed_file_refs); else fput(file); } -static void __io_req_do_free(struct io_kiocb *req) -{ - if (likely(!io_is_fallback_req(req))) - kmem_cache_free(req_cachep, req); - else - clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); -} - static void __io_req_aux_free(struct io_kiocb *req) { if (req->flags & REQ_F_NEED_CLEANUP) @@ -1362,6 +1351,8 @@ static void __io_req_aux_free(struct io_kiocb *req) kfree(req->io); if (req->file) io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); + if (req->task) + put_task_struct(req->task); io_req_work_drop_env(req); } @@ -1382,7 +1373,10 @@ static void __io_free_req(struct io_kiocb *req) } percpu_ref_put(&req->ctx->refs); - __io_req_do_free(req); + if (likely(!io_is_fallback_req(req))) + kmem_cache_free(req_cachep, req); + else + clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); } struct req_batch { @@ -1393,21 +1387,18 @@ struct req_batch { static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) { - int fixed_refs = rb->to_free; - if (!rb->to_free) return; if (rb->need_iter) { int i, inflight = 0; unsigned long flags; - fixed_refs = 0; for (i = 0; i < rb->to_free; i++) { struct io_kiocb *req = rb->reqs[i]; if (req->flags & REQ_F_FIXED_FILE) { req->file = NULL; - fixed_refs++; + percpu_ref_put(req->fixed_file_refs); } if (req->flags & REQ_F_INFLIGHT) inflight++; @@ -1433,8 +1424,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) } do_free: kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs); - if (fixed_refs) - percpu_ref_put_many(&ctx->file_data->refs, fixed_refs); percpu_ref_put_many(&ctx->refs, rb->to_free); rb->to_free = rb->need_iter = 0; } @@ -1738,11 +1727,24 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_free_req_many(ctx, &rb); } +static void io_iopoll_queue(struct list_head *again) +{ + struct io_kiocb *req; + + do { + req = list_first_entry(again, struct io_kiocb, list); + list_del(&req->list); + refcount_inc(&req->refs); + io_queue_async_work(req); + } while (!list_empty(again)); +} + static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, long min) { struct io_kiocb *req, *tmp; LIST_HEAD(done); + LIST_HEAD(again); bool spin; int ret; @@ -1757,9 +1759,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, struct kiocb *kiocb = &req->rw.kiocb; /* - * Move completed entries to our local list. If we find a - * request that requires polling, break out and complete - * the done list first, if we have entries there. + * Move completed and retryable entries to our local lists. + * If we find a request that requires polling, break out + * and complete those lists first, if we have entries there. */ if (req->flags & REQ_F_IOPOLL_COMPLETED) { list_move_tail(&req->list, &done); @@ -1768,6 +1770,13 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) break; + if (req->result == -EAGAIN) { + list_move_tail(&req->list, &again); + continue; + } + if (!list_empty(&again)) + break; + ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); if (ret < 0) break; @@ -1780,6 +1789,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) io_iopoll_complete(ctx, nr_events, &done); + if (!list_empty(&again)) + io_iopoll_queue(&again); + return ret; } @@ -2465,8 +2477,9 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, req->io->rw.iov = iovec; if (!req->io->rw.iov) { req->io->rw.iov = req->io->rw.fast_iov; - memcpy(req->io->rw.iov, fast_iov, - sizeof(struct iovec) * iter->nr_segs); + if (req->io->rw.iov != fast_iov) + memcpy(req->io->rw.iov, fast_iov, + sizeof(struct iovec) * iter->nr_segs); } else { req->flags |= REQ_F_NEED_CLEANUP; } @@ -2920,7 +2933,7 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (sqe->flags & IOSQE_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE) return -EBADF; if (req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -2929,6 +2942,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->open.how.mode = READ_ONCE(sqe->len); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); req->open.how.flags = READ_ONCE(sqe->open_flags); + if (force_o_largefile()) + req->open.how.flags |= O_LARGEFILE; req->open.filename = getname(fname); if (IS_ERR(req->open.filename)) { @@ -2951,7 +2966,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (sqe->flags & IOSQE_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE) return -EBADF; if (req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -3305,7 +3320,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->buf_index) return -EINVAL; - if (sqe->flags & IOSQE_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE) return -EBADF; if (req->flags & REQ_F_NEED_CLEANUP) return 0; @@ -3382,7 +3397,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; - if (sqe->flags & IOSQE_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); @@ -3481,14 +3496,11 @@ static void __io_sync_file_range(struct io_kiocb *req) static void io_sync_file_range_finish(struct io_wq_work **workptr) { struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *nxt = NULL; if (io_req_cancelled(req)) return; __io_sync_file_range(req); io_put_req(req); /* put submission ref */ - if (nxt) - io_wq_assign_next(workptr, nxt); } static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) @@ -4114,6 +4126,7 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, __poll_t mask, task_work_func_t func) { struct task_struct *tsk; + int ret; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) @@ -4127,11 +4140,15 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, req->result = mask; init_task_work(&req->task_work, func); /* - * If this fails, then the task is exiting. If that is the case, then - * the exit check will ultimately cancel these work items. Hence we - * don't need to check here and handle it specifically. + * If this fails, then the task is exiting. Punt to one of the io-wq + * threads to ensure the work gets run, we can't always rely on exit + * cancelation taking care of this. */ - task_work_add(tsk, &req->task_work, true); + ret = task_work_add(tsk, &req->task_work, true); + if (unlikely(ret)) { + tsk = io_wq_get_task(req->ctx->io_wq); + task_work_add(tsk, &req->task_work, true); + } wake_up_process(tsk); return 1; } @@ -4251,10 +4268,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) req->flags |= REQ_F_POLLED; memcpy(&apoll->work, &req->work, sizeof(req->work)); - /* - * Don't need a reference here, as we're adding it to the task - * task_works list. If the task exits, the list is pruned. - */ + get_task_struct(current); req->task = current; req->apoll = apoll; INIT_HLIST_NODE(&req->hash_node); @@ -4407,8 +4421,20 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) { struct io_ring_ctx *ctx = req->ctx; + struct io_poll_iocb *poll = &req->poll; + + if (!req->result && !READ_ONCE(poll->canceled)) { + struct poll_table_struct pt = { ._key = poll->events }; + + req->result = vfs_poll(req->file, &pt) & poll->events; + } spin_lock_irq(&ctx->completion_lock); + if (!req->result && !READ_ONCE(poll->canceled)) { + add_wait_queue(poll->head, &poll->wait); + spin_unlock_irq(&ctx->completion_lock); + return; + } hash_del(&req->hash_node); io_poll_complete(req, req->result, 0); req->flags |= REQ_F_COMP_LOCKED; @@ -4465,10 +4491,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - /* - * Don't need a reference here, as we're adding it to the task - * task_works list. If the task exits, the list is pruned. - */ + get_task_struct(current); req->task = current; return 0; } @@ -5331,7 +5354,8 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, file = io_file_from_index(ctx, fd); if (!file) return -EBADF; - percpu_ref_get(&ctx->file_data->refs); + req->fixed_file_refs = ctx->file_data->cur_refs; + percpu_ref_get(req->fixed_file_refs); } else { trace_io_uring_file_get(ctx, fd); file = __io_file_get(state, fd); @@ -5344,15 +5368,10 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, - const struct io_uring_sqe *sqe) + int fd, unsigned int flags) { - unsigned flags; - int fd; bool fixed; - flags = READ_ONCE(sqe->flags); - fd = READ_ONCE(sqe->fd); - if (!io_req_needs_file(req, fd)) return 0; @@ -5594,7 +5613,7 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, { struct io_ring_ctx *ctx = req->ctx; unsigned int sqe_flags; - int ret, id; + int ret, id, fd; sqe_flags = READ_ONCE(sqe->flags); @@ -5625,7 +5644,8 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, IOSQE_ASYNC | IOSQE_FIXED_FILE | IOSQE_BUFFER_SELECT); - ret = io_req_set_file(state, req, sqe); + fd = READ_ONCE(sqe->fd); + ret = io_req_set_file(state, req, fd, sqe_flags); if (unlikely(ret)) { err_req: io_cqring_add_event(req, ret); @@ -5741,8 +5761,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. */ -static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe **sqe_ptr) +static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) { u32 *sq_array = ctx->sq_array; unsigned head; @@ -5756,25 +5775,40 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, * though the application is the one updating it. */ head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]); - if (likely(head < ctx->sq_entries)) { - /* - * All io need record the previous position, if LINK vs DARIN, - * it can be used to mark the position of the first IO in the - * link list. - */ - req->sequence = ctx->cached_sq_head; - *sqe_ptr = &ctx->sq_sqes[head]; - req->opcode = READ_ONCE((*sqe_ptr)->opcode); - req->user_data = READ_ONCE((*sqe_ptr)->user_data); - ctx->cached_sq_head++; - return true; - } + if (likely(head < ctx->sq_entries)) + return &ctx->sq_sqes[head]; /* drop invalid entries */ - ctx->cached_sq_head++; ctx->cached_sq_dropped++; WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped); - return false; + return NULL; +} + +static inline void io_consume_sqe(struct io_ring_ctx *ctx) +{ + ctx->cached_sq_head++; +} + +static void io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + /* + * All io need record the previous position, if LINK vs DARIN, + * it can be used to mark the position of the first IO in the + * link list. + */ + req->sequence = ctx->cached_sq_head; + req->opcode = READ_ONCE(sqe->opcode); + req->user_data = READ_ONCE(sqe->user_data); + req->io = NULL; + req->file = NULL; + req->ctx = ctx; + req->flags = 0; + /* one is dropped after submission, the other at completion */ + refcount_set(&req->refs, 2); + req->task = NULL; + req->result = 0; + INIT_IO_WORK(&req->work, io_wq_submit_work); } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, @@ -5812,17 +5846,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, struct io_kiocb *req; int err; - req = io_get_req(ctx, statep); + sqe = io_get_sqe(ctx); + if (unlikely(!sqe)) { + io_consume_sqe(ctx); + break; + } + req = io_alloc_req(ctx, statep); if (unlikely(!req)) { if (!submitted) submitted = -EAGAIN; break; } - if (!io_get_sqring(ctx, req, &sqe)) { - __io_req_do_free(req); - break; - } + io_init_req(ctx, req, sqe); + io_consume_sqe(ctx); /* will complete beyond this point, count as submitted */ submitted++; @@ -5962,6 +5999,7 @@ static int io_sq_thread(void *data) } if (current->task_works) { task_work_run(); + finish_wait(&ctx->sqo_wait, &wait); continue; } if (signal_pending(current)) @@ -6124,43 +6162,36 @@ static void io_file_ref_kill(struct percpu_ref *ref) complete(&data->done); } -static void io_file_ref_exit_and_free(struct work_struct *work) -{ - struct fixed_file_data *data; - - data = container_of(work, struct fixed_file_data, ref_work); - - /* - * Ensure any percpu-ref atomic switch callback has run, it could have - * been in progress when the files were being unregistered. Once - * that's done, we can safely exit and free the ref and containing - * data structure. - */ - rcu_barrier(); - percpu_ref_exit(&data->refs); - kfree(data); -} - static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct fixed_file_data *data = ctx->file_data; + struct fixed_file_ref_node *ref_node = NULL; unsigned nr_tables, i; + unsigned long flags; if (!data) return -ENXIO; - percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill); - flush_work(&data->ref_work); + spin_lock_irqsave(&data->lock, flags); + if (!list_empty(&data->ref_list)) + ref_node = list_first_entry(&data->ref_list, + struct fixed_file_ref_node, node); + spin_unlock_irqrestore(&data->lock, flags); + if (ref_node) + percpu_ref_kill(&ref_node->refs); + + percpu_ref_kill(&data->refs); + + /* wait for all refs nodes to complete */ wait_for_completion(&data->done); - io_ring_file_ref_flush(data); __io_sqe_files_unregister(ctx); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); for (i = 0; i < nr_tables; i++) kfree(data->table[i].files); kfree(data->table); - INIT_WORK(&data->ref_work, io_file_ref_exit_and_free); - queue_work(system_wq, &data->ref_work); + percpu_ref_exit(&data->refs); + kfree(data); ctx->file_data = NULL; ctx->nr_user_files = 0; return 0; @@ -6204,13 +6235,6 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) struct sk_buff *skb; int i, nr_files; - if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { - unsigned long inflight = ctx->user->unix_inflight + nr; - - if (inflight > task_rlimit(current, RLIMIT_NOFILE)) - return -EMFILE; - } - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); if (!fpl) return -ENOMEM; @@ -6385,46 +6409,72 @@ static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file) } struct io_file_put { - struct llist_node llist; + struct list_head list; struct file *file; }; -static void io_ring_file_ref_flush(struct fixed_file_data *data) +static void io_file_put_work(struct work_struct *work) { + struct fixed_file_ref_node *ref_node; + struct fixed_file_data *file_data; + struct io_ring_ctx *ctx; struct io_file_put *pfile, *tmp; - struct llist_node *node; + unsigned long flags; - while ((node = llist_del_all(&data->put_llist)) != NULL) { - llist_for_each_entry_safe(pfile, tmp, node, llist) { - io_ring_file_put(data->ctx, pfile->file); - kfree(pfile); - } + ref_node = container_of(work, struct fixed_file_ref_node, work); + file_data = ref_node->file_data; + ctx = file_data->ctx; + + list_for_each_entry_safe(pfile, tmp, &ref_node->file_list, list) { + list_del_init(&pfile->list); + io_ring_file_put(ctx, pfile->file); + kfree(pfile); } + + spin_lock_irqsave(&file_data->lock, flags); + list_del_init(&ref_node->node); + spin_unlock_irqrestore(&file_data->lock, flags); + + percpu_ref_exit(&ref_node->refs); + kfree(ref_node); + percpu_ref_put(&file_data->refs); } -static void io_ring_file_ref_switch(struct work_struct *work) +static void io_file_data_ref_zero(struct percpu_ref *ref) { - struct fixed_file_data *data; + struct fixed_file_ref_node *ref_node; + + ref_node = container_of(ref, struct fixed_file_ref_node, refs); - data = container_of(work, struct fixed_file_data, ref_work); - io_ring_file_ref_flush(data); - percpu_ref_switch_to_percpu(&data->refs); + queue_work(system_wq, &ref_node->work); } -static void io_file_data_ref_zero(struct percpu_ref *ref) +static struct fixed_file_ref_node *alloc_fixed_file_ref_node( + struct io_ring_ctx *ctx) { - struct fixed_file_data *data; + struct fixed_file_ref_node *ref_node; - data = container_of(ref, struct fixed_file_data, refs); + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return ERR_PTR(-ENOMEM); + + if (percpu_ref_init(&ref_node->refs, io_file_data_ref_zero, + 0, GFP_KERNEL)) { + kfree(ref_node); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&ref_node->node); + INIT_LIST_HEAD(&ref_node->file_list); + INIT_WORK(&ref_node->work, io_file_put_work); + ref_node->file_data = ctx->file_data; + return ref_node; - /* - * We can't safely switch from inside this context, punt to wq. If - * the table ref is going away, the table is being unregistered. - * Don't queue up the async work for that case, the caller will - * handle it. - */ - if (!percpu_ref_is_dying(&data->refs)) - queue_work(system_wq, &data->ref_work); +} + +static void destroy_fixed_file_ref_node(struct fixed_file_ref_node *ref_node) +{ + percpu_ref_exit(&ref_node->refs); + kfree(ref_node); } static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, @@ -6435,6 +6485,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, struct file *file; int fd, ret = 0; unsigned i; + struct fixed_file_ref_node *ref_node; + unsigned long flags; if (ctx->file_data) return -EBUSY; @@ -6448,6 +6500,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -ENOMEM; ctx->file_data->ctx = ctx; init_completion(&ctx->file_data->done); + INIT_LIST_HEAD(&ctx->file_data->ref_list); + spin_lock_init(&ctx->file_data->lock); nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); ctx->file_data->table = kcalloc(nr_tables, @@ -6459,15 +6513,13 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -ENOMEM; } - if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero, + if (percpu_ref_init(&ctx->file_data->refs, io_file_ref_kill, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) { kfree(ctx->file_data->table); kfree(ctx->file_data); ctx->file_data = NULL; return -ENOMEM; } - ctx->file_data->put_llist.first = NULL; - INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch); if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { percpu_ref_exit(&ctx->file_data->refs); @@ -6530,9 +6582,22 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, } ret = io_sqe_files_scm(ctx); - if (ret) + if (ret) { io_sqe_files_unregister(ctx); + return ret; + } + + ref_node = alloc_fixed_file_ref_node(ctx); + if (IS_ERR(ref_node)) { + io_sqe_files_unregister(ctx); + return PTR_ERR(ref_node); + } + ctx->file_data->cur_refs = &ref_node->refs; + spin_lock_irqsave(&ctx->file_data->lock, flags); + list_add(&ref_node->node, &ctx->file_data->ref_list); + spin_unlock_irqrestore(&ctx->file_data->lock, flags); + percpu_ref_get(&ctx->file_data->refs); return ret; } @@ -6579,30 +6644,21 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } -static void io_atomic_switch(struct percpu_ref *ref) -{ - struct fixed_file_data *data; - - /* - * Juggle reference to ensure we hit zero, if needed, so we can - * switch back to percpu mode - */ - data = container_of(ref, struct fixed_file_data, refs); - percpu_ref_put(&data->refs); - percpu_ref_get(&data->refs); -} - static int io_queue_file_removal(struct fixed_file_data *data, - struct file *file) + struct file *file) { struct io_file_put *pfile; + struct percpu_ref *refs = data->cur_refs; + struct fixed_file_ref_node *ref_node; pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); if (!pfile) return -ENOMEM; + ref_node = container_of(refs, struct fixed_file_ref_node, refs); pfile->file = file; - llist_add(&pfile->llist, &data->put_llist); + list_add(&pfile->list, &ref_node->file_list); + return 0; } @@ -6611,17 +6667,23 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, unsigned nr_args) { struct fixed_file_data *data = ctx->file_data; - bool ref_switch = false; + struct fixed_file_ref_node *ref_node; struct file *file; __s32 __user *fds; int fd, i, err; __u32 done; + unsigned long flags; + bool needs_switch = false; if (check_add_overflow(up->offset, nr_args, &done)) return -EOVERFLOW; if (done > ctx->nr_user_files) return -EINVAL; + ref_node = alloc_fixed_file_ref_node(ctx); + if (IS_ERR(ref_node)) + return PTR_ERR(ref_node); + done = 0; fds = u64_to_user_ptr(up->fds); while (nr_args) { @@ -6642,7 +6704,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (err) break; table->files[index] = NULL; - ref_switch = true; + needs_switch = true; } if (fd != -1) { file = fget(fd); @@ -6673,11 +6735,19 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, up->offset++; } - if (ref_switch) - percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); + if (needs_switch) { + percpu_ref_kill(data->cur_refs); + spin_lock_irqsave(&data->lock, flags); + list_add(&ref_node->node, &data->ref_list); + data->cur_refs = &ref_node->refs; + spin_unlock_irqrestore(&data->lock, flags); + percpu_ref_get(&ctx->file_data->refs); + } else + destroy_fixed_file_ref_node(ref_node); return done ? done : err; } + static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { @@ -7203,6 +7273,18 @@ static int io_remove_personalities(int id, void *p, void *data) return 0; } +static void io_ring_exit_work(struct work_struct *work) +{ + struct io_ring_ctx *ctx; + + ctx = container_of(work, struct io_ring_ctx, exit_work); + if (ctx->rings) + io_cqring_overflow_flush(ctx, true); + + wait_for_completion(&ctx->completions[0]); + io_ring_ctx_free(ctx); +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -7230,8 +7312,8 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) if (ctx->rings) io_cqring_overflow_flush(ctx, true); idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx); - wait_for_completion(&ctx->completions[0]); - io_ring_ctx_free(ctx); + INIT_WORK(&ctx->exit_work, io_ring_exit_work); + queue_work(system_wq, &ctx->exit_work); } static int io_uring_release(struct inode *inode, struct file *file) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 7c84c4c027c4..89e21961d1ad 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -302,6 +302,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (!ctx->bio || !is_contig || bio_full(ctx->bio, plen)) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); + gfp_t orig_gfp = gfp; int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; if (ctx->bio) @@ -310,6 +311,13 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (ctx->is_readahead) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); + /* + * If the bio_alloc fails, try it again for a single page to + * avoid having to deal with partial page reads. This emulates + * what do_mpage_readpage does. + */ + if (!ctx->bio) + ctx->bio = bio_alloc(orig_gfp, 1); ctx->bio->bi_opf = REQ_OP_READ; if (ctx->is_readahead) ctx->bio->bi_opf |= REQ_RAHEAD; @@ -503,7 +511,8 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); int iomap_releasepage(struct page *page, gfp_t gfp_mask) { - trace_iomap_releasepage(page->mapping->host, page, 0, 0); + trace_iomap_releasepage(page->mapping->host, page_offset(page), + PAGE_SIZE); /* * mm accommodates an old ext3 case where clean pages might not have had @@ -520,7 +529,7 @@ EXPORT_SYMBOL_GPL(iomap_releasepage); void iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) { - trace_iomap_invalidatepage(page->mapping->host, page, offset, len); + trace_iomap_invalidatepage(page->mapping->host, offset, len); /* * If we are invalidating the entire page, clear the dirty state from it @@ -974,13 +983,6 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); } -static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, - struct iomap *iomap) -{ - return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, - iomap_sector(iomap, pos & PAGE_MASK), offset, bytes); -} - static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, void *data, struct iomap *iomap, struct iomap *srcmap) @@ -1000,7 +1002,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, bytes = min_t(loff_t, PAGE_SIZE - offset, count); if (IS_DAX(inode)) - status = iomap_dax_zero(pos, offset, bytes, iomap); + status = dax_iomap_zero(pos, offset, bytes, iomap); else status = iomap_zero(inode, pos, offset, bytes, iomap, srcmap); @@ -1519,7 +1521,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) u64 end_offset; loff_t offset; - trace_iomap_writepage(inode, page, 0, 0); + trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE); /* * Refuse to write the page out if we are called from reclaim context. diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 23837926c0c5..20dde5aadcdd 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -534,8 +534,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, /* * We are about to drop our additional submission reference, which - * might be the last reference to the dio. There are three three - * different ways we can progress here: + * might be the last reference to the dio. There are three different + * ways we can progress here: * * (a) If this is the last reference we will always complete and free * the dio ourselves. diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 6dc227b8c47e..4df19c66f597 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -41,14 +41,12 @@ DEFINE_EVENT(iomap_readpage_class, name, \ DEFINE_READPAGE_EVENT(iomap_readpage); DEFINE_READPAGE_EVENT(iomap_readpages); -DECLARE_EVENT_CLASS(iomap_page_class, - TP_PROTO(struct inode *inode, struct page *page, unsigned long off, - unsigned int len), - TP_ARGS(inode, page, off, len), +DECLARE_EVENT_CLASS(iomap_range_class, + TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), + TP_ARGS(inode, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) - __field(pgoff_t, pgoff) __field(loff_t, size) __field(unsigned long, offset) __field(unsigned int, length) @@ -56,29 +54,26 @@ DECLARE_EVENT_CLASS(iomap_page_class, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->pgoff = page_offset(page); __entry->size = i_size_read(inode); __entry->offset = off; __entry->length = len; ), - TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset %lx " "length %x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->pgoff, __entry->size, __entry->offset, __entry->length) ) -#define DEFINE_PAGE_EVENT(name) \ -DEFINE_EVENT(iomap_page_class, name, \ - TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ - unsigned int len), \ - TP_ARGS(inode, page, off, len)) -DEFINE_PAGE_EVENT(iomap_writepage); -DEFINE_PAGE_EVENT(iomap_releasepage); -DEFINE_PAGE_EVENT(iomap_invalidatepage); +#define DEFINE_RANGE_EVENT(name) \ +DEFINE_EVENT(iomap_range_class, name, \ + TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),\ + TP_ARGS(inode, off, len)) +DEFINE_RANGE_EVENT(iomap_writepage); +DEFINE_RANGE_EVENT(iomap_releasepage); +DEFINE_RANGE_EVENT(iomap_invalidatepage); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 27373f5792a4..e855d8260433 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -997,9 +997,10 @@ restart_loop: * journalled data) we need to unmap buffer and clear * more bits. We also need to be careful about the check * because the data page mapping can get cleared under - * out hands, which alse need not to clear more bits - * because the page and buffers will be freed and can - * never be reused once we are done with them. + * our hands. Note that if mapping == NULL, we don't + * need to make buffer unmapped because the page is + * already detached from the mapping and buffers cannot + * get reused. */ mapping = READ_ONCE(bh->b_page->mapping); if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) { diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index d0f7a5abd9a9..fc2469a20fed 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -53,6 +53,8 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc) kn->iattr->ia_ctime = kn->iattr->ia_atime; simple_xattrs_init(&kn->iattr->xattrs); + atomic_set(&kn->iattr->nr_user_xattrs, 0); + atomic_set(&kn->iattr->user_xattr_size, 0); out_unlock: ret = kn->iattr; mutex_unlock(&iattr_mutex); @@ -303,7 +305,7 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name, if (!attrs) return -ENOMEM; - return simple_xattr_set(&attrs->xattrs, name, value, size, flags); + return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL); } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, @@ -327,6 +329,86 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, return kernfs_xattr_set(kn, name, value, size, flags); } +static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, + const char *full_name, + struct simple_xattrs *xattrs, + const void *value, size_t size, int flags) +{ + atomic_t *sz = &kn->iattr->user_xattr_size; + atomic_t *nr = &kn->iattr->nr_user_xattrs; + ssize_t removed_size; + int ret; + + if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { + ret = -ENOSPC; + goto dec_count_out; + } + + if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) { + ret = -ENOSPC; + goto dec_size_out; + } + + ret = simple_xattr_set(xattrs, full_name, value, size, flags, + &removed_size); + + if (!ret && removed_size >= 0) + size = removed_size; + else if (!ret) + return 0; +dec_size_out: + atomic_sub(size, sz); +dec_count_out: + atomic_dec(nr); + return ret; +} + +static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, + const char *full_name, + struct simple_xattrs *xattrs, + const void *value, size_t size, int flags) +{ + atomic_t *sz = &kn->iattr->user_xattr_size; + atomic_t *nr = &kn->iattr->nr_user_xattrs; + ssize_t removed_size; + int ret; + + ret = simple_xattr_set(xattrs, full_name, value, size, flags, + &removed_size); + + if (removed_size >= 0) { + atomic_sub(removed_size, sz); + atomic_dec(nr); + } + + return ret; +} + +static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *suffix, const void *value, + size_t size, int flags) +{ + const char *full_name = xattr_full_name(handler, suffix); + struct kernfs_node *kn = inode->i_private; + struct kernfs_iattrs *attrs; + + if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR)) + return -EOPNOTSUPP; + + attrs = kernfs_iattrs(kn); + if (!attrs) + return -ENOMEM; + + if (value) + return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs, + value, size, flags); + else + return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs, + value, size, flags); + +} + static const struct xattr_handler kernfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = kernfs_vfs_xattr_get, @@ -339,8 +421,15 @@ static const struct xattr_handler kernfs_security_xattr_handler = { .set = kernfs_vfs_xattr_set, }; +static const struct xattr_handler kernfs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = kernfs_vfs_xattr_get, + .set = kernfs_vfs_user_xattr_set, +}; + const struct xattr_handler *kernfs_xattr_handlers[] = { &kernfs_trusted_xattr_handler, &kernfs_security_xattr_handler, + &kernfs_user_xattr_handler, NULL }; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 2f3c51d55261..7ee97ef59184 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -26,6 +26,8 @@ struct kernfs_iattrs { struct timespec64 ia_ctime; struct simple_xattrs xattrs; + atomic_t nr_user_xattrs; + atomic_t user_xattr_size; }; /* +1 to avoid triggering overflow warning when negating it */ diff --git a/fs/namei.c b/fs/namei.c index db6565c99825..a320371899cf 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -503,9 +503,10 @@ struct nameidata { } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; struct nameidata *saved; - struct inode *link_inode; unsigned root_seq; int dfd; + kuid_t dir_uid; + umode_t dir_mode; } __randomize_layout; static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) @@ -530,52 +531,34 @@ static void restore_nameidata(void) kfree(now->stack); } -static int __nd_alloc_stack(struct nameidata *nd) +static bool nd_alloc_stack(struct nameidata *nd) { struct saved *p; - if (nd->flags & LOOKUP_RCU) { - p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), - GFP_ATOMIC); - if (unlikely(!p)) - return -ECHILD; - } else { - p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), - GFP_KERNEL); - if (unlikely(!p)) - return -ENOMEM; - } + p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), + nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); + if (unlikely(!p)) + return false; memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; - return 0; + return true; } /** - * path_connected - Verify that a path->dentry is below path->mnt.mnt_root - * @path: nameidate to verify + * path_connected - Verify that a dentry is below mnt.mnt_root * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. */ -static bool path_connected(const struct path *path) +static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { - struct vfsmount *mnt = path->mnt; struct super_block *sb = mnt->mnt_sb; /* Bind mounts and multi-root filesystems can have disconnected paths */ if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) return true; - return is_subdir(path->dentry, mnt->mnt_root); -} - -static inline int nd_alloc_stack(struct nameidata *nd) -{ - if (likely(nd->depth != EMBEDDED_LEVELS)) - return 0; - if (likely(nd->stack != nd->internal)) - return 0; - return __nd_alloc_stack(nd); + return is_subdir(dentry, mnt->mnt_root); } static void drop_links(struct nameidata *nd) @@ -608,10 +591,9 @@ static void terminate_walk(struct nameidata *nd) } /* path_put is needed afterwards regardless of success or failure */ -static bool legitimize_path(struct nameidata *nd, - struct path *path, unsigned seq) +static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) { - int res = __legitimize_mnt(path->mnt, nd->m_seq); + int res = __legitimize_mnt(path->mnt, mseq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; @@ -625,6 +607,12 @@ static bool legitimize_path(struct nameidata *nd, return !read_seqcount_retry(&path->dentry->d_seq, seq); } +static inline bool legitimize_path(struct nameidata *nd, + struct path *path, unsigned seq) +{ + return __legitimize_path(path, seq, nd->m_seq); +} + static bool legitimize_links(struct nameidata *nd) { int i; @@ -858,25 +846,6 @@ static int set_root(struct nameidata *nd) return 0; } -static void path_put_conditional(struct path *path, struct nameidata *nd) -{ - dput(path->dentry); - if (path->mnt != nd->path.mnt) - mntput(path->mnt); -} - -static inline void path_to_nameidata(const struct path *path, - struct nameidata *nd) -{ - if (!(nd->flags & LOOKUP_RCU)) { - dput(nd->path.dentry); - if (nd->path.mnt != path->mnt) - mntput(nd->path.mnt); - } - nd->path.mnt = path->mnt; - nd->path.dentry = path->dentry; -} - static int nd_jump_root(struct nameidata *nd) { if (unlikely(nd->flags & LOOKUP_BENEATH)) @@ -969,28 +938,21 @@ int sysctl_protected_regular __read_mostly; * * Returns 0 if following the symlink is allowed, -ve on error. */ -static inline int may_follow_link(struct nameidata *nd) +static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { - const struct inode *inode; - const struct inode *parent; - kuid_t puid; - if (!sysctl_protected_symlinks) return 0; /* Allowed if owner and follower match. */ - inode = nd->link_inode; if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; /* Allowed if parent directory not sticky and world-writable. */ - parent = nd->inode; - if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) + if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ - puid = parent->i_uid; - if (uid_valid(puid) && uid_eq(puid, inode->i_uid)) + if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -1113,63 +1075,6 @@ static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid, return 0; } -static __always_inline -const char *get_link(struct nameidata *nd) -{ - struct saved *last = nd->stack + nd->depth - 1; - struct dentry *dentry = last->link.dentry; - struct inode *inode = nd->link_inode; - int error; - const char *res; - - if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) - return ERR_PTR(-ELOOP); - - if (!(nd->flags & LOOKUP_RCU)) { - touch_atime(&last->link); - cond_resched(); - } else if (atime_needs_update(&last->link, inode)) { - if (unlikely(unlazy_walk(nd))) - return ERR_PTR(-ECHILD); - touch_atime(&last->link); - } - - error = security_inode_follow_link(dentry, inode, - nd->flags & LOOKUP_RCU); - if (unlikely(error)) - return ERR_PTR(error); - - nd->last_type = LAST_BIND; - res = READ_ONCE(inode->i_link); - if (!res) { - const char * (*get)(struct dentry *, struct inode *, - struct delayed_call *); - get = inode->i_op->get_link; - if (nd->flags & LOOKUP_RCU) { - res = get(NULL, inode, &last->done); - if (res == ERR_PTR(-ECHILD)) { - if (unlikely(unlazy_walk(nd))) - return ERR_PTR(-ECHILD); - res = get(dentry, inode, &last->done); - } - } else { - res = get(dentry, inode, &last->done); - } - if (IS_ERR_OR_NULL(res)) - return res; - } - if (*res == '/') { - error = nd_jump_root(nd); - if (unlikely(error)) - return ERR_PTR(error); - while (unlikely(*++res == '/')) - ; - } - if (!*res) - res = NULL; - return res; -} - /* * follow_up - Find the mountpoint of path's vfsmount * @@ -1203,19 +1108,59 @@ int follow_up(struct path *path) } EXPORT_SYMBOL(follow_up); +static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, + struct path *path, unsigned *seqp) +{ + while (mnt_has_parent(m)) { + struct dentry *mountpoint = m->mnt_mountpoint; + + m = m->mnt_parent; + if (unlikely(root->dentry == mountpoint && + root->mnt == &m->mnt)) + break; + if (mountpoint != m->mnt.mnt_root) { + path->mnt = &m->mnt; + path->dentry = mountpoint; + *seqp = read_seqcount_begin(&mountpoint->d_seq); + return true; + } + } + return false; +} + +static bool choose_mountpoint(struct mount *m, const struct path *root, + struct path *path) +{ + bool found; + + rcu_read_lock(); + while (1) { + unsigned seq, mseq = read_seqbegin(&mount_lock); + + found = choose_mountpoint_rcu(m, root, path, &seq); + if (unlikely(!found)) { + if (!read_seqretry(&mount_lock, mseq)) + break; + } else { + if (likely(__legitimize_path(path, seq, mseq))) + break; + rcu_read_unlock(); + path_put(path); + rcu_read_lock(); + } + } + rcu_read_unlock(); + return found; +} + /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ -static int follow_automount(struct path *path, struct nameidata *nd, - bool *need_mntput) +static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { - struct vfsmount *mnt; - int err; - - if (!path->dentry->d_op || !path->dentry->d_op->d_automount) - return -EREMOTE; + struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to @@ -1228,138 +1173,91 @@ static int follow_automount(struct path *path, struct nameidata *nd, * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ - if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && - path->dentry->d_inode) + dentry->d_inode) return -EISDIR; - nd->total_link_count++; - if (nd->total_link_count >= 40) + if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; - mnt = path->dentry->d_op->d_automount(path); - if (IS_ERR(mnt)) { - /* - * The filesystem is allowed to return -EISDIR here to indicate - * it doesn't want to automount. For instance, autofs would do - * this so that its userspace daemon can mount on this dentry. - * - * However, we can only permit this if it's a terminal point in - * the path being looked up; if it wasn't then the remainder of - * the path is inaccessible and we should say so. - */ - if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT)) - return -EREMOTE; - return PTR_ERR(mnt); - } - - if (!mnt) /* mount collision */ - return 0; - - if (!*need_mntput) { - /* lock_mount() may release path->mnt on error */ - mntget(path->mnt); - *need_mntput = true; - } - err = finish_automount(mnt, path); - - switch (err) { - case -EBUSY: - /* Someone else made a mount here whilst we were busy */ - return 0; - case 0: - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - return 0; - default: - return err; - } - + return finish_automount(dentry->d_op->d_automount(path), path); } /* - * Handle a dentry that is managed in some way. - * - Flagged for transit management (autofs) - * - Flagged as mountpoint - * - Flagged as automount point - * - * This may only be called in refwalk mode. - * On success path->dentry is known positive. - * - * Serialization is taken care of in namespace.c + * mount traversal - out-of-line part. One note on ->d_flags accesses - + * dentries are pinned but not locked here, so negative dentry can go + * positive right under us. Use of smp_load_acquire() provides a barrier + * sufficient for ->d_inode and ->d_flags consistency. */ -static int follow_managed(struct path *path, struct nameidata *nd) +static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, + int *count, unsigned lookup_flags) { - struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ - unsigned flags; + struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; - /* Given that we're not holding a lock here, we retain the value in a - * local variable for each dentry as we look at it so that we don't see - * the components of that value change under us */ - while (flags = smp_load_acquire(&path->dentry->d_flags), - unlikely(flags & DCACHE_MANAGED_DENTRY)) { + while (flags & DCACHE_MANAGED_DENTRY) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { - BUG_ON(!path->dentry->d_op); - BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } - /* Transit to a mounted filesystem. */ - if (flags & DCACHE_MOUNTED) { + if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); - if (mounted) { + if (mounted) { // ... in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); + // here we know it's positive + flags = path->dentry->d_flags; need_mntput = true; continue; } - - /* Something is mounted on this dentry in another - * namespace and/or whatever was mounted there in this - * namespace got unmounted before lookup_mnt() could - * get it */ } - /* Handle an automount point */ - if (flags & DCACHE_NEED_AUTOMOUNT) { - ret = follow_automount(path, nd, &need_mntput); - if (ret < 0) - break; - continue; - } + if (!(flags & DCACHE_NEED_AUTOMOUNT)) + break; - /* We didn't change the current path point */ - break; + // uncovered automount point + ret = follow_automount(path, count, lookup_flags); + flags = smp_load_acquire(&path->dentry->d_flags); + if (ret < 0) + break; } - if (need_mntput) { - if (path->mnt == mnt) - mntput(path->mnt); - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - ret = -EXDEV; - else - nd->flags |= LOOKUP_JUMPED; - } - if (ret == -EISDIR || !ret) - ret = 1; - if (ret > 0 && unlikely(d_flags_negative(flags))) + if (ret == -EISDIR) + ret = 0; + // possible if you race with several mount --move + if (need_mntput && path->mnt == mnt) + mntput(path->mnt); + if (!ret && unlikely(d_flags_negative(flags))) ret = -ENOENT; - if (unlikely(ret < 0)) - path_put_conditional(path, nd); + *jumped = need_mntput; return ret; } +static inline int traverse_mounts(struct path *path, bool *jumped, + int *count, unsigned lookup_flags) +{ + unsigned flags = smp_load_acquire(&path->dentry->d_flags); + + /* fastpath */ + if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { + *jumped = false; + if (unlikely(d_flags_negative(flags))) + return -ENOENT; + return 0; + } + return __traverse_mounts(path, flags, jumped, count, lookup_flags); +} + int follow_down_one(struct path *path) { struct vfsmount *mounted; @@ -1376,11 +1274,22 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); -static inline int managed_dentry_rcu(const struct path *path) +/* + * Follow down to the covering mount currently visible to userspace. At each + * point, the filesystem owning that dentry may be queried as to whether the + * caller is permitted to proceed or not. + */ +int follow_down(struct path *path) { - return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? - path->dentry->d_op->d_manage(path, true) : 0; + struct vfsmount *mnt = path->mnt; + bool jumped; + int ret = traverse_mounts(path, &jumped, NULL, 0); + + if (path->mnt != mnt) + mntput(mnt); + return ret; } +EXPORT_SYMBOL(follow_down); /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if @@ -1389,204 +1298,88 @@ static inline int managed_dentry_rcu(const struct path *path) static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, struct inode **inode, unsigned *seqp) { + struct dentry *dentry = path->dentry; + unsigned int flags = dentry->d_flags; + + if (likely(!(flags & DCACHE_MANAGED_DENTRY))) + return true; + + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return false; + for (;;) { - struct mount *mounted; /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - switch (managed_dentry_rcu(path)) { - case -ECHILD: - default: - return false; - case -EISDIR: - return true; - case 0: - break; - } - - if (!d_mountpoint(path->dentry)) - return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); - - mounted = __lookup_mnt(path->mnt, path->dentry); - if (!mounted) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return false; - path->mnt = &mounted->mnt; - path->dentry = mounted->mnt.mnt_root; - nd->flags |= LOOKUP_JUMPED; - *seqp = read_seqcount_begin(&path->dentry->d_seq); - /* - * Update the inode too. We don't need to re-check the - * dentry sequence number here after this d_inode read, - * because a mount-point is always pinned. - */ - *inode = path->dentry->d_inode; - } - return !read_seqretry(&mount_lock, nd->m_seq) && - !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); -} - -static int follow_dotdot_rcu(struct nameidata *nd) -{ - struct inode *inode = nd->inode; - - while (1) { - if (path_equal(&nd->path, &nd->root)) { - if (unlikely(nd->flags & LOOKUP_BENEATH)) - return -ECHILD; - break; + if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { + int res = dentry->d_op->d_manage(path, true); + if (res) + return res == -EISDIR; + flags = dentry->d_flags; } - if (nd->path.dentry != nd->path.mnt->mnt_root) { - struct dentry *old = nd->path.dentry; - struct dentry *parent = old->d_parent; - unsigned seq; - inode = parent->d_inode; - seq = read_seqcount_begin(&parent->d_seq); - if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) - return -ECHILD; - nd->path.dentry = parent; - nd->seq = seq; - if (unlikely(!path_connected(&nd->path))) - return -ECHILD; - break; - } else { - struct mount *mnt = real_mount(nd->path.mnt); - struct mount *mparent = mnt->mnt_parent; - struct dentry *mountpoint = mnt->mnt_mountpoint; - struct inode *inode2 = mountpoint->d_inode; - unsigned seq = read_seqcount_begin(&mountpoint->d_seq); - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) - return -ECHILD; - if (&mparent->mnt == nd->path.mnt) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -ECHILD; - /* we know that mountpoint was pinned */ - nd->path.dentry = mountpoint; - nd->path.mnt = &mparent->mnt; - inode = inode2; - nd->seq = seq; + if (flags & DCACHE_MOUNTED) { + struct mount *mounted = __lookup_mnt(path->mnt, dentry); + if (mounted) { + path->mnt = &mounted->mnt; + dentry = path->dentry = mounted->mnt.mnt_root; + nd->flags |= LOOKUP_JUMPED; + *seqp = read_seqcount_begin(&dentry->d_seq); + *inode = dentry->d_inode; + /* + * We don't need to re-check ->d_seq after this + * ->d_inode read - there will be an RCU delay + * between mount hash removal and ->mnt_root + * becoming unpinned. + */ + flags = dentry->d_flags; + continue; + } + if (read_seqretry(&mount_lock, nd->m_seq)) + return false; } + return !(flags & DCACHE_NEED_AUTOMOUNT); } - while (unlikely(d_mountpoint(nd->path.dentry))) { - struct mount *mounted; - mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); - if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) - return -ECHILD; - if (!mounted) - break; - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -ECHILD; - nd->path.mnt = &mounted->mnt; - nd->path.dentry = mounted->mnt.mnt_root; - inode = nd->path.dentry->d_inode; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - } - nd->inode = inode; - return 0; } -/* - * Follow down to the covering mount currently visible to userspace. At each - * point, the filesystem owning that dentry may be queried as to whether the - * caller is permitted to proceed or not. - */ -int follow_down(struct path *path) +static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, + struct path *path, struct inode **inode, + unsigned int *seqp) { - unsigned managed; + bool jumped; int ret; - while (managed = READ_ONCE(path->dentry->d_flags), - unlikely(managed & DCACHE_MANAGED_DENTRY)) { - /* Allow the filesystem to manage the transit without i_mutex - * being held. - * - * We indicate to the filesystem if someone is trying to mount - * something here. This gives autofs the chance to deny anyone - * other than its daemon the right to mount on its - * superstructure. - * - * The filesystem may sleep at this point. - */ - if (managed & DCACHE_MANAGE_TRANSIT) { - BUG_ON(!path->dentry->d_op); - BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path, false); - if (ret < 0) - return ret == -EISDIR ? 0 : ret; - } - - /* Transit to a mounted filesystem. */ - if (managed & DCACHE_MOUNTED) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); - continue; - } - - /* Don't handle automount points here */ - break; - } - return 0; -} -EXPORT_SYMBOL(follow_down); - -/* - * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() - */ -static void follow_mount(struct path *path) -{ - while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path); - if (!mounted) - break; - dput(path->dentry); - mntput(path->mnt); - path->mnt = mounted; - path->dentry = dget(mounted->mnt_root); + path->mnt = nd->path.mnt; + path->dentry = dentry; + if (nd->flags & LOOKUP_RCU) { + unsigned int seq = *seqp; + if (unlikely(!*inode)) + return -ENOENT; + if (likely(__follow_mount_rcu(nd, path, inode, seqp))) + return 0; + if (unlazy_child(nd, dentry, seq)) + return -ECHILD; + // *path might've been clobbered by __follow_mount_rcu() + path->mnt = nd->path.mnt; + path->dentry = dentry; } -} - -static int path_parent_directory(struct path *path) -{ - struct dentry *old = path->dentry; - /* rare case of legitimate dget_parent()... */ - path->dentry = dget_parent(path->dentry); - dput(old); - if (unlikely(!path_connected(path))) - return -ENOENT; - return 0; -} - -static int follow_dotdot(struct nameidata *nd) -{ - while (1) { - if (path_equal(&nd->path, &nd->root)) { - if (unlikely(nd->flags & LOOKUP_BENEATH)) - return -EXDEV; - break; - } - if (nd->path.dentry != nd->path.mnt->mnt_root) { - int ret = path_parent_directory(&nd->path); - if (ret) - return ret; - break; - } - if (!follow_up(&nd->path)) - break; + ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); + if (jumped) { if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - return -EXDEV; + ret = -EXDEV; + else + nd->flags |= LOOKUP_JUMPED; } - follow_mount(&nd->path); - nd->inode = nd->path.dentry->d_inode; - return 0; + if (unlikely(ret)) { + dput(path->dentry); + if (path->mnt != nd->path.mnt) + mntput(path->mnt); + } else { + *inode = d_backing_inode(path->dentry); + *seqp = 0; /* out of RCU mode, so the value doesn't matter */ + } + return ret; } /* @@ -1643,14 +1436,12 @@ static struct dentry *__lookup_hash(const struct qstr *name, return dentry; } -static int lookup_fast(struct nameidata *nd, - struct path *path, struct inode **inode, - unsigned *seqp) +static struct dentry *lookup_fast(struct nameidata *nd, + struct inode **inode, + unsigned *seqp) { - struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; int status = 1; - int err; /* * Rename seqlock is not required here because in the off chance @@ -1659,12 +1450,11 @@ static int lookup_fast(struct nameidata *nd, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (unlikely(!dentry)) { if (unlazy_walk(nd)) - return -ECHILD; - return 0; + return ERR_PTR(-ECHILD); + return NULL; } /* @@ -1672,9 +1462,8 @@ static int lookup_fast(struct nameidata *nd, * the dentry name information from lookup. */ *inode = d_backing_inode(dentry); - negative = d_is_negative(dentry); if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) - return -ECHILD; + return ERR_PTR(-ECHILD); /* * This sequence count validates that the parent had no @@ -1684,46 +1473,30 @@ static int lookup_fast(struct nameidata *nd, * enough, we can use __read_seqcount_retry here. */ if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq))) - return -ECHILD; + return ERR_PTR(-ECHILD); *seqp = seq; status = d_revalidate(dentry, nd->flags); - if (likely(status > 0)) { - /* - * Note: do negative dentry check after revalidation in - * case that drops it. - */ - if (unlikely(negative)) - return -ENOENT; - path->mnt = mnt; - path->dentry = dentry; - if (likely(__follow_mount_rcu(nd, path, inode, seqp))) - return 1; - } + if (likely(status > 0)) + return dentry; if (unlazy_child(nd, dentry, seq)) - return -ECHILD; + return ERR_PTR(-ECHILD); if (unlikely(status == -ECHILD)) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) - return 0; + return NULL; status = d_revalidate(dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); - return status; + return ERR_PTR(status); } - - path->mnt = mnt; - path->dentry = dentry; - err = follow_managed(path, nd); - if (likely(err > 0)) - *inode = d_backing_inode(path->dentry); - return err; + return dentry; } /* Fast lookup failed, do it the slow way */ @@ -1788,81 +1561,107 @@ static inline int may_lookup(struct nameidata *nd) return inode_permission(nd->inode, MAY_EXEC); } -static inline int handle_dots(struct nameidata *nd, int type) +static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) { - if (type == LAST_DOTDOT) { - int error = 0; + if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) + return -ELOOP; - if (!nd->root.mnt) { - error = set_root(nd); - if (error) - return error; - } - if (nd->flags & LOOKUP_RCU) - error = follow_dotdot_rcu(nd); - else - error = follow_dotdot(nd); - if (error) - return error; + if (likely(nd->depth != EMBEDDED_LEVELS)) + return 0; + if (likely(nd->stack != nd->internal)) + return 0; + if (likely(nd_alloc_stack(nd))) + return 0; - if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { - /* - * If there was a racing rename or mount along our - * path, then we can't be sure that ".." hasn't jumped - * above nd->root (and so userspace should retry or use - * some fallback). - */ - smp_rmb(); - if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) - return -EAGAIN; - if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) - return -EAGAIN; - } + if (nd->flags & LOOKUP_RCU) { + // we need to grab link before we do unlazy. And we can't skip + // unlazy even if we fail to grab the link - cleanup needs it + bool grabbed_link = legitimize_path(nd, link, seq); + + if (unlazy_walk(nd) != 0 || !grabbed_link) + return -ECHILD; + + if (nd_alloc_stack(nd)) + return 0; } - return 0; + return -ENOMEM; } -static int pick_link(struct nameidata *nd, struct path *link, - struct inode *inode, unsigned seq) +enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; + +static const char *pick_link(struct nameidata *nd, struct path *link, + struct inode *inode, unsigned seq, int flags) { - int error; struct saved *last; - if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) { - path_to_nameidata(link, nd); - return -ELOOP; - } - if (!(nd->flags & LOOKUP_RCU)) { - if (link->mnt == nd->path.mnt) - mntget(link->mnt); - } - error = nd_alloc_stack(nd); + const char *res; + int error = reserve_stack(nd, link, seq); + if (unlikely(error)) { - if (error == -ECHILD) { - if (unlikely(!legitimize_path(nd, link, seq))) { - drop_links(nd); - nd->depth = 0; - nd->flags &= ~LOOKUP_RCU; - nd->path.mnt = NULL; - nd->path.dentry = NULL; - rcu_read_unlock(); - } else if (likely(unlazy_walk(nd)) == 0) - error = nd_alloc_stack(nd); - } - if (error) { + if (!(nd->flags & LOOKUP_RCU)) path_put(link); - return error; - } + return ERR_PTR(error); } - last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); - nd->link_inode = inode; last->seq = seq; - return 1; -} -enum {WALK_FOLLOW = 1, WALK_MORE = 2}; + if (flags & WALK_TRAILING) { + error = may_follow_link(nd, inode); + if (unlikely(error)) + return ERR_PTR(error); + } + + if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS)) + return ERR_PTR(-ELOOP); + + if (!(nd->flags & LOOKUP_RCU)) { + touch_atime(&last->link); + cond_resched(); + } else if (atime_needs_update(&last->link, inode)) { + if (unlikely(unlazy_walk(nd))) + return ERR_PTR(-ECHILD); + touch_atime(&last->link); + } + + error = security_inode_follow_link(link->dentry, inode, + nd->flags & LOOKUP_RCU); + if (unlikely(error)) + return ERR_PTR(error); + + res = READ_ONCE(inode->i_link); + if (!res) { + const char * (*get)(struct dentry *, struct inode *, + struct delayed_call *); + get = inode->i_op->get_link; + if (nd->flags & LOOKUP_RCU) { + res = get(NULL, inode, &last->done); + if (res == ERR_PTR(-ECHILD)) { + if (unlikely(unlazy_walk(nd))) + return ERR_PTR(-ECHILD); + res = get(link->dentry, inode, &last->done); + } + } else { + res = get(link->dentry, inode, &last->done); + } + if (!res) + goto all_done; + if (IS_ERR(res)) + return res; + } + if (*res == '/') { + error = nd_jump_root(nd); + if (unlikely(error)) + return ERR_PTR(error); + while (unlikely(*++res == '/')) + ; + } + if (*res) + return res; +all_done: // pure jump + put_link(nd); + return NULL; +} /* * Do we need to follow links? We _really_ want to be able @@ -1870,63 +1669,187 @@ enum {WALK_FOLLOW = 1, WALK_MORE = 2}; * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static inline int step_into(struct nameidata *nd, struct path *path, - int flags, struct inode *inode, unsigned seq) +static const char *step_into(struct nameidata *nd, int flags, + struct dentry *dentry, struct inode *inode, unsigned seq) { - if (!(flags & WALK_MORE) && nd->depth) - put_link(nd); - if (likely(!d_is_symlink(path->dentry)) || - !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) { + struct path path; + int err = handle_mounts(nd, dentry, &path, &inode, &seq); + + if (err < 0) + return ERR_PTR(err); + if (likely(!d_is_symlink(path.dentry)) || + ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || + (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ - path_to_nameidata(path, nd); + if (!(nd->flags & LOOKUP_RCU)) { + dput(nd->path.dentry); + if (nd->path.mnt != path.mnt) + mntput(nd->path.mnt); + } + nd->path = path; nd->inode = inode; nd->seq = seq; - return 0; + return NULL; } - /* make sure that d_is_symlink above matches inode */ if (nd->flags & LOOKUP_RCU) { - if (read_seqcount_retry(&path->dentry->d_seq, seq)) - return -ECHILD; + /* make sure that d_is_symlink above matches inode */ + if (read_seqcount_retry(&path.dentry->d_seq, seq)) + return ERR_PTR(-ECHILD); + } else { + if (path.mnt == nd->path.mnt) + mntget(path.mnt); } - return pick_link(nd, path, inode, seq); + return pick_link(nd, &path, inode, seq, flags); } -static int walk_component(struct nameidata *nd, int flags) +static struct dentry *follow_dotdot_rcu(struct nameidata *nd, + struct inode **inodep, + unsigned *seqp) { - struct path path; + struct dentry *parent, *old; + + if (path_equal(&nd->path, &nd->root)) + goto in_root; + if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { + struct path path; + unsigned seq; + if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), + &nd->root, &path, &seq)) + goto in_root; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return ERR_PTR(-ECHILD); + nd->path = path; + nd->inode = path.dentry->d_inode; + nd->seq = seq; + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return ERR_PTR(-ECHILD); + /* we know that mountpoint was pinned */ + } + old = nd->path.dentry; + parent = old->d_parent; + *inodep = parent->d_inode; + *seqp = read_seqcount_begin(&parent->d_seq); + if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) + return ERR_PTR(-ECHILD); + if (unlikely(!path_connected(nd->path.mnt, parent))) + return ERR_PTR(-ECHILD); + return parent; +in_root: + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return ERR_PTR(-ECHILD); + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return ERR_PTR(-ECHILD); + return NULL; +} + +static struct dentry *follow_dotdot(struct nameidata *nd, + struct inode **inodep, + unsigned *seqp) +{ + struct dentry *parent; + + if (path_equal(&nd->path, &nd->root)) + goto in_root; + if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { + struct path path; + + if (!choose_mountpoint(real_mount(nd->path.mnt), + &nd->root, &path)) + goto in_root; + path_put(&nd->path); + nd->path = path; + nd->inode = path.dentry->d_inode; + if (unlikely(nd->flags & LOOKUP_NO_XDEV)) + return ERR_PTR(-EXDEV); + } + /* rare case of legitimate dget_parent()... */ + parent = dget_parent(nd->path.dentry); + if (unlikely(!path_connected(nd->path.mnt, parent))) { + dput(parent); + return ERR_PTR(-ENOENT); + } + *seqp = 0; + *inodep = parent->d_inode; + return parent; + +in_root: + if (unlikely(nd->flags & LOOKUP_BENEATH)) + return ERR_PTR(-EXDEV); + dget(nd->path.dentry); + return NULL; +} + +static const char *handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + const char *error = NULL; + struct dentry *parent; + struct inode *inode; + unsigned seq; + + if (!nd->root.mnt) { + error = ERR_PTR(set_root(nd)); + if (error) + return error; + } + if (nd->flags & LOOKUP_RCU) + parent = follow_dotdot_rcu(nd, &inode, &seq); + else + parent = follow_dotdot(nd, &inode, &seq); + if (IS_ERR(parent)) + return ERR_CAST(parent); + if (unlikely(!parent)) + error = step_into(nd, WALK_NOFOLLOW, + nd->path.dentry, nd->inode, nd->seq); + else + error = step_into(nd, WALK_NOFOLLOW, + parent, inode, seq); + if (unlikely(error)) + return error; + + if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { + /* + * If there was a racing rename or mount along our + * path, then we can't be sure that ".." hasn't jumped + * above nd->root (and so userspace should retry or use + * some fallback). + */ + smp_rmb(); + if (unlikely(__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))) + return ERR_PTR(-EAGAIN); + if (unlikely(__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))) + return ERR_PTR(-EAGAIN); + } + } + return NULL; +} + +static const char *walk_component(struct nameidata *nd, int flags) +{ + struct dentry *dentry; struct inode *inode; unsigned seq; - int err; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { - err = handle_dots(nd, nd->last_type); if (!(flags & WALK_MORE) && nd->depth) put_link(nd); - return err; + return handle_dots(nd, nd->last_type); } - err = lookup_fast(nd, &path, &inode, &seq); - if (unlikely(err <= 0)) { - if (err < 0) - return err; - path.dentry = lookup_slow(&nd->last, nd->path.dentry, - nd->flags); - if (IS_ERR(path.dentry)) - return PTR_ERR(path.dentry); - - path.mnt = nd->path.mnt; - err = follow_managed(&path, nd); - if (unlikely(err < 0)) - return err; - - seq = 0; /* we are already out of RCU mode */ - inode = d_backing_inode(path.dentry); + dentry = lookup_fast(nd, &inode, &seq); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + if (unlikely(!dentry)) { + dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); } - - return step_into(nd, &path, flags, inode, seq); + if (!(flags & WALK_MORE) && nd->depth) + put_link(nd); + return step_into(nd, flags, dentry, inode, seq); } /* @@ -2167,8 +2090,11 @@ static inline u64 hash_name(const void *salt, const char *name) */ static int link_path_walk(const char *name, struct nameidata *nd) { + int depth = 0; // depth <= nd->depth int err; + nd->last_type = LAST_ROOT; + nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); while (*name=='/') @@ -2178,6 +2104,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { + const char *link; u64 hash_len; int type; @@ -2227,36 +2154,27 @@ static int link_path_walk(const char *name, struct nameidata *nd) } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: - /* pathname body, done */ - if (!nd->depth) - return 0; - name = nd->stack[nd->depth - 1].name; - /* trailing symlink, done */ - if (!name) + /* pathname or trailing symlink, done */ + if (!depth) { + nd->dir_uid = nd->inode->i_uid; + nd->dir_mode = nd->inode->i_mode; + nd->flags &= ~LOOKUP_PARENT; return 0; + } /* last component of nested symlink */ - err = walk_component(nd, WALK_FOLLOW); + name = nd->stack[--depth].name; + link = walk_component(nd, 0); } else { /* not the last component */ - err = walk_component(nd, WALK_FOLLOW | WALK_MORE); + link = walk_component(nd, WALK_MORE); } - if (err < 0) - return err; - - if (err) { - const char *s = get_link(nd); - - if (IS_ERR(s)) - return PTR_ERR(s); - err = 0; - if (unlikely(!s)) { - /* jumped */ - put_link(nd); - } else { - nd->stack[nd->depth - 1].name = name; - name = s; - continue; - } + if (unlikely(link)) { + if (IS_ERR(link)) + return PTR_ERR(link); + /* a symlink to follow */ + nd->stack[depth++].name = name; + name = link; + continue; } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { @@ -2279,8 +2197,7 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_RCU) rcu_read_lock(); - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; + nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); @@ -2370,54 +2287,20 @@ static const char *path_init(struct nameidata *nd, unsigned flags) return s; } -static const char *trailing_symlink(struct nameidata *nd) -{ - const char *s; - int error = may_follow_link(nd); - if (unlikely(error)) - return ERR_PTR(error); - nd->flags |= LOOKUP_PARENT; - nd->stack[0].name = NULL; - s = get_link(nd); - return s ? s : ""; -} - -static inline int lookup_last(struct nameidata *nd) +static inline const char *lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - nd->flags &= ~LOOKUP_PARENT; - return walk_component(nd, 0); + return walk_component(nd, WALK_TRAILING); } static int handle_lookup_down(struct nameidata *nd) { - struct path path = nd->path; - struct inode *inode = nd->inode; - unsigned seq = nd->seq; - int err; - - if (nd->flags & LOOKUP_RCU) { - /* - * don't bother with unlazy_walk on failure - we are - * at the very beginning of walk, so we lose nothing - * if we simply redo everything in non-RCU mode - */ - if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq))) - return -ECHILD; - } else { - dget(path.dentry); - err = follow_managed(&path, nd); - if (unlikely(err < 0)) - return err; - inode = d_backing_inode(path.dentry); - seq = 0; - } - path_to_nameidata(&path, nd); - nd->inode = inode; - nd->seq = seq; - return 0; + if (!(nd->flags & LOOKUP_RCU)) + dget(nd->path.dentry); + return PTR_ERR(step_into(nd, WALK_NOFOLLOW, + nd->path.dentry, nd->inode, nd->seq)); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ @@ -2432,16 +2315,19 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path s = ERR_PTR(err); } - while (!(err = link_path_walk(s, nd)) - && ((err = lookup_last(nd)) > 0)) { - s = trailing_symlink(nd); - } + while (!(err = link_path_walk(s, nd)) && + (s = lookup_last(nd)) != NULL) + ; if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; + if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { + err = handle_lookup_down(nd); + nd->flags &= ~LOOKUP_JUMPED; // no d_weak_revalidate(), please... + } if (!err) { *path = nd->path; nd->path.mnt = NULL; @@ -2470,7 +2356,8 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) - audit_inode(name, path->dentry, 0); + audit_inode(name, path->dentry, + flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); putname(name); return retval; @@ -2718,24 +2605,23 @@ int path_pts(struct path *path) /* Find something mounted on "pts" in the same directory as * the input path. */ - struct dentry *child, *parent; - struct qstr this; - int ret; - - ret = path_parent_directory(path); - if (ret) - return ret; + struct dentry *parent = dget_parent(path->dentry); + struct dentry *child; + struct qstr this = QSTR_INIT("pts", 3); - parent = path->dentry; - this.name = "pts"; - this.len = 3; + if (unlikely(!path_connected(path->mnt, parent))) { + dput(parent); + return -ENOENT; + } + dput(path->dentry); + path->dentry = parent; child = d_hash_and_lookup(parent, &this); if (!child) return -ENOENT; path->dentry = child; dput(parent); - follow_mount(path); + follow_down(path); return 0; } #endif @@ -2748,88 +2634,6 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, } EXPORT_SYMBOL(user_path_at_empty); -/** - * path_mountpoint - look up a path to be umounted - * @nd: lookup context - * @flags: lookup flags - * @path: pointer to container for result - * - * Look up the given name, but don't attempt to revalidate the last component. - * Returns 0 and "path" will be valid on success; Returns error otherwise. - */ -static int -path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) -{ - const char *s = path_init(nd, flags); - int err; - - while (!(err = link_path_walk(s, nd)) && - (err = lookup_last(nd)) > 0) { - s = trailing_symlink(nd); - } - if (!err && (nd->flags & LOOKUP_RCU)) - err = unlazy_walk(nd); - if (!err) - err = handle_lookup_down(nd); - if (!err) { - *path = nd->path; - nd->path.mnt = NULL; - nd->path.dentry = NULL; - } - terminate_walk(nd); - return err; -} - -static int -filename_mountpoint(int dfd, struct filename *name, struct path *path, - unsigned int flags) -{ - struct nameidata nd; - int error; - if (IS_ERR(name)) - return PTR_ERR(name); - set_nameidata(&nd, dfd, name); - error = path_mountpoint(&nd, flags | LOOKUP_RCU, path); - if (unlikely(error == -ECHILD)) - error = path_mountpoint(&nd, flags, path); - if (unlikely(error == -ESTALE)) - error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); - if (likely(!error)) - audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL); - restore_nameidata(); - putname(name); - return error; -} - -/** - * user_path_mountpoint_at - lookup a path from userland in order to umount it - * @dfd: directory file descriptor - * @name: pathname from userland - * @flags: lookup flags - * @path: pointer to container to hold result - * - * A umount is a special case for path walking. We're not actually interested - * in the inode in this situation, and ESTALE errors can be a problem. We - * simply want track down the dentry and vfsmount attached at the mountpoint - * and avoid revalidating the last component. - * - * Returns 0 and populates "path" on success. - */ -int -user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, - struct path *path) -{ - return filename_mountpoint(dfd, getname(name), path, flags); -} - -int -kern_path_mountpoint(int dfd, const char *name, struct path *path, - unsigned int flags) -{ - return filename_mountpoint(dfd, getname_kernel(name), path, flags); -} -EXPORT_SYMBOL(kern_path_mountpoint); - int __check_sticky(struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); @@ -3127,18 +2931,14 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m * * Returns an error code otherwise. */ -static int atomic_open(struct nameidata *nd, struct dentry *dentry, - struct path *path, struct file *file, - const struct open_flags *op, - int open_flag, umode_t mode) +static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, + struct file *file, + int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; int error; - if (!(~open_flag & (O_EXCL | O_CREAT))) /* both O_EXCL and O_CREAT */ - open_flag &= ~O_TRUNC; - if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; @@ -3149,19 +2949,10 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { - /* - * We didn't have the inode before the open, so check open - * permission here. - */ - int acc_mode = op->acc_mode; - if (file->f_mode & FMODE_CREATED) { - WARN_ON(!(open_flag & O_CREAT)); - fsnotify_create(dir, dentry); - acc_mode = 0; + if (unlikely(dentry != file->f_path.dentry)) { + dput(dentry); + dentry = dget(file->f_path.dentry); } - error = may_open(&file->f_path, acc_mode, open_flag); - if (WARN_ON(error > 0)) - error = -EINVAL; } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { @@ -3169,19 +2960,15 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, dput(dentry); dentry = file->f_path.dentry; } - if (file->f_mode & FMODE_CREATED) - fsnotify_create(dir, dentry); - if (unlikely(d_is_negative(dentry))) { + if (unlikely(d_is_negative(dentry))) error = -ENOENT; - } else { - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 0; - } } } - dput(dentry); - return error; + if (error) { + dput(dentry); + dentry = ERR_PTR(error); + } + return dentry; } /* @@ -3199,10 +2986,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, * * An error code is returned on failure. */ -static int lookup_open(struct nameidata *nd, struct path *path, - struct file *file, - const struct open_flags *op, - bool got_write) +static struct dentry *lookup_open(struct nameidata *nd, struct file *file, + const struct open_flags *op, + bool got_write) { struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; @@ -3213,7 +2999,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) - return -ENOENT; + return ERR_PTR(-ENOENT); file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); @@ -3221,7 +3007,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) - return PTR_ERR(dentry); + return dentry; } if (d_in_lookup(dentry)) break; @@ -3237,7 +3023,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ - goto out_no_open; + return dentry; } /* @@ -3249,41 +3035,27 @@ static int lookup_open(struct nameidata *nd, struct path *path, * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ + if (unlikely(!got_write)) + open_flag &= ~O_TRUNC; if (open_flag & O_CREAT) { + if (open_flag & O_EXCL) + open_flag &= ~O_TRUNC; if (!IS_POSIXACL(dir->d_inode)) mode &= ~current_umask(); - if (unlikely(!got_write)) { - create_error = -EROFS; - open_flag &= ~O_CREAT; - if (open_flag & (O_EXCL | O_TRUNC)) - goto no_open; - /* No side effects, safe to clear O_CREAT */ - } else { + if (likely(got_write)) create_error = may_o_create(&nd->path, dentry, mode); - if (create_error) { - open_flag &= ~O_CREAT; - if (open_flag & O_EXCL) - goto no_open; - } - } - } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) && - unlikely(!got_write)) { - /* - * No O_CREATE -> atomicity not a requirement -> fall - * back to lookup + open - */ - goto no_open; + else + create_error = -EROFS; } - + if (create_error) + open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { - error = atomic_open(nd, dentry, path, file, op, open_flag, - mode); - if (unlikely(error == -ENOENT) && create_error) - error = create_error; - return error; + dentry = atomic_open(nd, dentry, file, open_flag, mode); + if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) + dentry = ERR_PTR(create_error); + return dentry; } -no_open: if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); @@ -3310,78 +3082,60 @@ no_open: open_flag & O_EXCL); if (error) goto out_dput; - fsnotify_create(dir_inode, dentry); } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } -out_no_open: - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 0; + return dentry; out_dput: dput(dentry); - return error; + return ERR_PTR(error); } -/* - * Handle the last step of open() - */ -static int do_last(struct nameidata *nd, +static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; - kuid_t dir_uid = nd->inode->i_uid; - umode_t dir_mode = nd->inode->i_mode; int open_flag = op->open_flag; - bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; - int acc_mode = op->acc_mode; unsigned seq; struct inode *inode; - struct path path; + struct dentry *dentry; + const char *res; int error; - nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { - error = handle_dots(nd, nd->last_type); - if (unlikely(error)) - return error; - goto finish_open; + if (nd->depth) + put_link(nd); + return handle_dots(nd, nd->last_type); } if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ - error = lookup_fast(nd, &path, &inode, &seq); - if (likely(error > 0)) + dentry = lookup_fast(nd, &inode, &seq); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + if (likely(dentry)) goto finish_lookup; - if (error < 0) - return error; - - BUG_ON(nd->inode != dir->d_inode); BUG_ON(nd->flags & LOOKUP_RCU); } else { /* create side of things */ - /* - * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED - * has been cleared when we got to the last component we are - * about to look up - */ - error = complete_walk(nd); - if (error) - return error; - + if (nd->flags & LOOKUP_RCU) { + error = unlazy_walk(nd); + if (unlikely(error)) + return ERR_PTR(error); + } audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* trailing slashes? */ if (unlikely(nd->last.name[nd->last.len])) - return -EISDIR; + return ERR_PTR(-EISDIR); } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { @@ -3398,108 +3152,90 @@ static int do_last(struct nameidata *nd, inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); - error = lookup_open(nd, &path, file, op, got_write); + dentry = lookup_open(nd, file, op, got_write); + if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED)) + fsnotify_create(dir->d_inode, dentry); if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); - if (error) - goto out; - - if (file->f_mode & FMODE_OPENED) { - if ((file->f_mode & FMODE_CREATED) || - !S_ISREG(file_inode(file)->i_mode)) - will_truncate = false; - - audit_inode(nd->name, file->f_path.dentry, 0); - goto opened; - } + if (got_write) + mnt_drop_write(nd->path.mnt); - if (file->f_mode & FMODE_CREATED) { - /* Don't check for write permission, don't truncate */ - open_flag &= ~O_TRUNC; - will_truncate = false; - acc_mode = 0; - path_to_nameidata(&path, nd); - goto finish_open_created; - } + if (IS_ERR(dentry)) + return ERR_CAST(dentry); - /* - * If atomic_open() acquired write access it is dropped now due to - * possible mount and symlink following (this might be optimized away if - * necessary...) - */ - if (got_write) { - mnt_drop_write(nd->path.mnt); - got_write = false; + if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { + dput(nd->path.dentry); + nd->path.dentry = dentry; + return NULL; } - error = follow_managed(&path, nd); - if (unlikely(error < 0)) - return error; +finish_lookup: + if (nd->depth) + put_link(nd); + res = step_into(nd, WALK_TRAILING, dentry, inode, seq); + if (unlikely(res)) + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + return res; +} - /* - * create/update audit record if it already exists. - */ - audit_inode(nd->name, path.dentry, 0); +/* + * Handle the last step of open() + */ +static int do_open(struct nameidata *nd, + struct file *file, const struct open_flags *op) +{ + int open_flag = op->open_flag; + bool do_truncate; + int acc_mode; + int error; - if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { - path_to_nameidata(&path, nd); - return -EEXIST; + if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { + error = complete_walk(nd); + if (error) + return error; } - - seq = 0; /* out of RCU mode, so the value doesn't matter */ - inode = d_backing_inode(path.dentry); -finish_lookup: - error = step_into(nd, &path, 0, inode, seq); - if (unlikely(error)) - return error; -finish_open: - /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ - error = complete_walk(nd); - if (error) - return error; - audit_inode(nd->name, nd->path.dentry, 0); + if (!(file->f_mode & FMODE_CREATED)) + audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { - error = -EISDIR; + if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) + return -EEXIST; if (d_is_dir(nd->path.dentry)) - goto out; - error = may_create_in_sticky(dir_mode, dir_uid, + return -EISDIR; + error = may_create_in_sticky(nd->dir_mode, nd->dir_uid, d_backing_inode(nd->path.dentry)); if (unlikely(error)) - goto out; + return error; } - error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) - goto out; - if (!d_is_reg(nd->path.dentry)) - will_truncate = false; + return -ENOTDIR; - if (will_truncate) { + do_truncate = false; + acc_mode = op->acc_mode; + if (file->f_mode & FMODE_CREATED) { + /* Don't check for write permission, don't truncate */ + open_flag &= ~O_TRUNC; + acc_mode = 0; + } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) - goto out; - got_write = true; + return error; + do_truncate = true; } -finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); - if (error) - goto out; - BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ - error = vfs_open(&nd->path, file); - if (error) - goto out; -opened: - error = ima_file_check(file, op->acc_mode); - if (!error && will_truncate) + if (!error && !(file->f_mode & FMODE_OPENED)) + error = vfs_open(&nd->path, file); + if (!error) + error = ima_file_check(file, op->acc_mode); + if (!error && do_truncate) error = handle_truncate(file); -out: if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } - if (got_write) + if (do_truncate) mnt_drop_write(nd->path.mnt); return error; } @@ -3604,10 +3340,10 @@ static struct file *path_openat(struct nameidata *nd, } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && - (error = do_last(nd, file, op)) > 0) { - nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - s = trailing_symlink(nd); - } + (s = open_last_lookups(nd, file, op)) != NULL) + ; + if (!error) + error = do_open(nd, file, op); terminate_walk(nd); } if (likely(!error)) { diff --git a/fs/namespace.c b/fs/namespace.c index 85b5f7bea82e..a28e4db075ed 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1669,7 +1669,7 @@ int ksys_umount(char __user *name, int flags) struct path path; struct mount *mnt; int retval; - int lookup_flags = 0; + int lookup_flags = LOOKUP_MOUNTPOINT; if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) return -EINVAL; @@ -1680,7 +1680,7 @@ int ksys_umount(char __user *name, int flags) if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; - retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path); + retval = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (retval) goto out; mnt = real_mount(path.mnt); @@ -2697,45 +2697,32 @@ static int do_move_mount_old(struct path *path, const char *old_name) /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, + struct path *path, int mnt_flags) { - struct mountpoint *mp; - struct mount *parent; - int err; + struct mount *parent = real_mount(path->mnt); mnt_flags &= ~MNT_INTERNAL_FLAGS; - mp = lock_mount(path); - if (IS_ERR(mp)) - return PTR_ERR(mp); - - parent = real_mount(path->mnt); - err = -EINVAL; if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) - goto unlock; + return -EINVAL; /* ... and for those we'd better have mountpoint still alive */ if (!parent->mnt_ns) - goto unlock; + return -EINVAL; } /* Refuse the same filesystem on the same mount point */ - err = -EBUSY; if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path->mnt->mnt_root == path->dentry) - goto unlock; + return -EBUSY; - err = -EINVAL; if (d_is_symlink(newmnt->mnt.mnt_root)) - goto unlock; + return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; - err = graft_tree(newmnt, parent, mp); - -unlock: - unlock_mount(mp); - return err; + return graft_tree(newmnt, parent, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); @@ -2748,6 +2735,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { struct vfsmount *mnt; + struct mountpoint *mp; struct super_block *sb = fc->root->d_sb; int error; @@ -2768,7 +2756,13 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, mnt_warn_timestamp_expiry(mountpoint, mnt); - error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + mp = lock_mount(mountpoint); + if (IS_ERR(mp)) { + mntput(mnt); + return PTR_ERR(mp); + } + error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); + unlock_mount(mp); if (error < 0) mntput(mnt); return error; @@ -2829,23 +2823,63 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int finish_automount(struct vfsmount *m, struct path *path) { - struct mount *mnt = real_mount(m); + struct dentry *dentry = path->dentry; + struct mountpoint *mp; + struct mount *mnt; int err; + + if (!m) + return 0; + if (IS_ERR(m)) + return PTR_ERR(m); + + mnt = real_mount(m); /* The new mount record should have at least 2 refs to prevent it being * expired before we get a chance to add it */ BUG_ON(mnt_get_count(mnt) < 2); if (m->mnt_sb == path->mnt->mnt_sb && - m->mnt_root == path->dentry) { + m->mnt_root == dentry) { err = -ELOOP; - goto fail; + goto discard; } - err = do_add_mount(mnt, path, path->mnt->mnt_flags | MNT_SHRINKABLE); - if (!err) - return 0; -fail: + /* + * we don't want to use lock_mount() - in this case finding something + * that overmounts our mountpoint to be means "quitely drop what we've + * got", not "try to mount it on top". + */ + inode_lock(dentry->d_inode); + namespace_lock(); + if (unlikely(cant_mount(dentry))) { + err = -ENOENT; + goto discard_locked; + } + rcu_read_lock(); + if (unlikely(__lookup_mnt(path->mnt, dentry))) { + rcu_read_unlock(); + err = 0; + goto discard_locked; + } + rcu_read_unlock(); + mp = get_mountpoint(dentry); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + goto discard_locked; + } + + err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); + unlock_mount(mp); + if (unlikely(err)) + goto discard; + mntput(m); + return 0; + +discard_locked: + namespace_unlock(); + inode_unlock(dentry->d_inode); +discard: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { namespace_lock(); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 690221747b47..d1a0e2c8b1b4 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -476,7 +476,7 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) err = ext_tree_remove(bl, true, 0, LLONG_MAX); WARN_ON(err); - kfree(bl); + kfree_rcu(bl, bl_layout.plh_rcu); } static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode, diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 549350259840..6a2033131c06 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -127,7 +127,9 @@ extern __be32 nfs4_callback_sequence(void *argp, void *resp, #define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9 #define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12 #define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15 -#define RCA4_TYPE_MASK_ALL 0xf31f +#define PNFS_FF_RCA4_TYPE_MASK_READ 16 +#define PNFS_FF_RCA4_TYPE_MASK_RW 17 +#define RCA4_TYPE_MASK_ALL 0x3f31f struct cb_recallanyargs { uint32_t craa_objs_to_keep; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index cd4c6bc81cae..e61dbc9b86ae 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -121,31 +121,31 @@ out: */ static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp, const nfs4_stateid *stateid) + __must_hold(RCU) { struct nfs_server *server; struct inode *inode; struct pnfs_layout_hdr *lo; + rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry(lo, &server->layouts, plh_layouts) { + list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { + if (!pnfs_layout_is_valid(lo)) + continue; if (stateid != NULL && !nfs4_stateid_match_other(stateid, &lo->plh_stateid)) continue; + if (!nfs_sb_active(server->super)) + continue; inode = igrab(lo->plh_inode); - if (!inode) - return ERR_PTR(-EAGAIN); - if (!nfs_sb_active(inode->i_sb)) { - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - iput(inode); - spin_lock(&clp->cl_lock); - rcu_read_lock(); - return ERR_PTR(-EAGAIN); - } - return inode; + rcu_read_unlock(); + if (inode) + return inode; + nfs_sb_deactive(server->super); + return ERR_PTR(-EAGAIN); } } - + rcu_read_unlock(); return ERR_PTR(-ENOENT); } @@ -163,28 +163,25 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp, struct inode *inode; struct pnfs_layout_hdr *lo; + rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry(lo, &server->layouts, plh_layouts) { + list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { nfsi = NFS_I(lo->plh_inode); if (nfs_compare_fh(fh, &nfsi->fh)) continue; if (nfsi->layout != lo) continue; + if (!nfs_sb_active(server->super)) + continue; inode = igrab(lo->plh_inode); - if (!inode) - return ERR_PTR(-EAGAIN); - if (!nfs_sb_active(inode->i_sb)) { - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - iput(inode); - spin_lock(&clp->cl_lock); - rcu_read_lock(); - return ERR_PTR(-EAGAIN); - } - return inode; + rcu_read_unlock(); + if (inode) + return inode; + nfs_sb_deactive(server->super); + return ERR_PTR(-EAGAIN); } } - + rcu_read_unlock(); return ERR_PTR(-ENOENT); } @@ -194,14 +191,9 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp, { struct inode *inode; - spin_lock(&clp->cl_lock); - rcu_read_lock(); inode = nfs_layout_find_inode_by_stateid(clp, stateid); if (inode == ERR_PTR(-ENOENT)) inode = nfs_layout_find_inode_by_fh(clp, fh); - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - return inode; } @@ -280,7 +272,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto unlock; } - pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + pnfs_set_layout_stateid(lo, &args->cbl_stateid, NULL, true); switch (pnfs_mark_matching_lsegs_return(lo, &free_me_list, &args->cbl_range, be32_to_cpu(args->cbl_stateid.seqid))) { @@ -605,6 +597,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, struct cb_recallanyargs *args = argp; __be32 status; fmode_t flags = 0; + bool schedule_manager = false; status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); if (!cps->clp) /* set in cb_sequence */ @@ -627,6 +620,18 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT)) pnfs_recall_all_layouts(cps->clp); + + if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) { + set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state); + schedule_manager = true; + } + if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_RW)) { + set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &cps->clp->cl_state); + schedule_manager = true; + } + if (schedule_manager) + nfs4_schedule_state_manager(cps->clp); + out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 1865322de142..816e1427f17e 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -378,6 +378,18 @@ nfs_inode_detach_delegation(struct inode *inode) } static void +nfs_update_delegation_cred(struct nfs_delegation *delegation, + const struct cred *cred) +{ + const struct cred *old; + + if (cred_fscmp(delegation->cred, cred) != 0) { + old = xchg(&delegation->cred, get_cred(cred)); + put_cred(old); + } +} + +static void nfs_update_inplace_delegation(struct nfs_delegation *delegation, const struct nfs_delegation *update) { @@ -385,8 +397,14 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation, delegation->stateid.seqid = update->stateid.seqid; smp_wmb(); delegation->type = update->type; - if (test_and_clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + delegation->pagemod_limit = update->pagemod_limit; + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + delegation->change_attr = update->change_attr; + nfs_update_delegation_cred(delegation, update->cred); + /* smp_mb__before_atomic() is implicit due to xchg() */ + clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags); atomic_long_inc(&nfs_active_delegations); + } } } @@ -545,21 +563,11 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) return ret; } -/** - * nfs_client_return_marked_delegations - return previously marked delegations - * @clp: nfs_client to process - * - * Note that this function is designed to be called by the state - * manager thread. For this reason, it cannot flush the dirty data, - * since that could deadlock in case of a state recovery error. - * - * Returns zero on success, or a negative errno value. - */ -int nfs_client_return_marked_delegations(struct nfs_client *clp) +static int nfs_server_return_marked_delegations(struct nfs_server *server, + void __always_unused *data) { struct nfs_delegation *delegation; struct nfs_delegation *prev; - struct nfs_server *server; struct inode *inode; struct inode *place_holder = NULL; struct nfs_delegation *place_holder_deleg = NULL; @@ -569,78 +577,79 @@ restart: /* * To avoid quadratic looping we hold a reference * to an inode place_holder. Each time we restart, we - * list nfs_servers from the server of that inode, and - * delegation in the server from the delegations of that - * inode. + * list delegation in the server from the delegations + * of that inode. * prev is an RCU-protected pointer to a delegation which * wasn't marked for return and might be a good choice for * the next place_holder. */ - rcu_read_lock(); prev = NULL; + delegation = NULL; + rcu_read_lock(); if (place_holder) - server = NFS_SERVER(place_holder); - else - server = list_entry_rcu(clp->cl_superblocks.next, - struct nfs_server, client_link); - list_for_each_entry_from_rcu(server, &clp->cl_superblocks, client_link) { - delegation = NULL; - if (place_holder && server == NFS_SERVER(place_holder)) - delegation = rcu_dereference(NFS_I(place_holder)->delegation); - if (!delegation || delegation != place_holder_deleg) - delegation = list_entry_rcu(server->delegations.next, - struct nfs_delegation, super_list); - list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) { - struct inode *to_put = NULL; - - if (!nfs_delegation_need_return(delegation)) { + delegation = rcu_dereference(NFS_I(place_holder)->delegation); + if (!delegation || delegation != place_holder_deleg) + delegation = list_entry_rcu(server->delegations.next, + struct nfs_delegation, super_list); + list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) { + struct inode *to_put = NULL; + + if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags)) + continue; + if (!nfs_delegation_need_return(delegation)) { + if (nfs4_is_valid_delegation(delegation, 0)) prev = delegation; - continue; - } - if (!nfs_sb_active(server->super)) - break; /* continue in outer loop */ - - if (prev) { - struct inode *tmp; - - tmp = nfs_delegation_grab_inode(prev); - if (tmp) { - to_put = place_holder; - place_holder = tmp; - place_holder_deleg = prev; - } - } + continue; + } - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - if (to_put) - iput(to_put); - nfs_sb_deactive(server->super); - goto restart; + if (prev) { + struct inode *tmp = nfs_delegation_grab_inode(prev); + if (tmp) { + to_put = place_holder; + place_holder = tmp; + place_holder_deleg = prev; } - delegation = nfs_start_delegation_return_locked(NFS_I(inode)); + } + + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { rcu_read_unlock(); + iput(to_put); + goto restart; + } + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); + rcu_read_unlock(); - if (to_put) - iput(to_put); + iput(to_put); - err = nfs_end_delegation_return(inode, delegation, 0); - iput(inode); - nfs_sb_deactive(server->super); - cond_resched(); - if (!err) - goto restart; - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); - if (place_holder) - iput(place_holder); - return err; - } + err = nfs_end_delegation_return(inode, delegation, 0); + iput(inode); + cond_resched(); + if (!err) + goto restart; + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); + goto out; } rcu_read_unlock(); - if (place_holder) - iput(place_holder); - return 0; +out: + iput(place_holder); + return err; +} + +/** + * nfs_client_return_marked_delegations - return previously marked delegations + * @clp: nfs_client to process + * + * Note that this function is designed to be called by the state + * manager thread. For this reason, it cannot flush the dirty data, + * since that could deadlock in case of a state recovery error. + * + * Returns zero on success, or a negative errno value. + */ +int nfs_client_return_marked_delegations(struct nfs_client *clp) +{ + return nfs_client_for_each_server(clp, + nfs_server_return_marked_delegations, NULL); } /** @@ -1083,53 +1092,51 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp) rcu_read_unlock(); } -/** - * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done - * @clp: nfs_client to process - * - */ -void nfs_delegation_reap_unclaimed(struct nfs_client *clp) +static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server, + void __always_unused *data) { struct nfs_delegation *delegation; - struct nfs_server *server; struct inode *inode; - restart: rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry_rcu(delegation, &server->delegations, - super_list) { - if (test_bit(NFS_DELEGATION_INODE_FREEING, - &delegation->flags) || - test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags) || - test_bit(NFS_DELEGATION_NEED_RECLAIM, - &delegation->flags) == 0) - continue; - if (!nfs_sb_active(server->super)) - break; /* continue in outer loop */ - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - nfs_sb_deactive(server->super); - goto restart; - } - delegation = nfs_start_delegation_return_locked(NFS_I(inode)); - rcu_read_unlock(); - if (delegation != NULL) { - if (nfs_detach_delegation(NFS_I(inode), delegation, - server) != NULL) - nfs_free_delegation(delegation); - /* Match nfs_start_delegation_return_locked */ - nfs_put_delegation(delegation); - } - iput(inode); - nfs_sb_deactive(server->super); - cond_resched(); - goto restart; +restart_locked: + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_NEED_RECLAIM, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + goto restart_locked; + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); + rcu_read_unlock(); + if (delegation != NULL) { + if (nfs_detach_delegation(NFS_I(inode), delegation, + server) != NULL) + nfs_free_delegation(delegation); + /* Match nfs_start_delegation_return_locked */ + nfs_put_delegation(delegation); } + iput(inode); + cond_resched(); + goto restart; } rcu_read_unlock(); + return 0; +} + +/** + * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done + * @clp: nfs_client to process + * + */ +void nfs_delegation_reap_unclaimed(struct nfs_client *clp) +{ + nfs_client_for_each_server(clp, nfs_server_reap_unclaimed_delegations, + NULL); } static inline bool nfs4_server_rebooted(const struct nfs_client *clp) @@ -1215,62 +1222,61 @@ nfs_delegation_test_free_expired(struct inode *inode, nfs_remove_bad_delegation(inode, stateid); } -/** - * nfs_reap_expired_delegations - reap expired delegations - * @clp: nfs_client to process - * - * Iterates through all the delegations associated with this server and - * checks if they have may have been revoked. This function is usually - * expected to be called in cases where the server may have lost its - * lease. - */ -void nfs_reap_expired_delegations(struct nfs_client *clp) +static int nfs_server_reap_expired_delegations(struct nfs_server *server, + void __always_unused *data) { struct nfs_delegation *delegation; - struct nfs_server *server; struct inode *inode; const struct cred *cred; nfs4_stateid stateid; - restart: rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - list_for_each_entry_rcu(delegation, &server->delegations, - super_list) { - if (test_bit(NFS_DELEGATION_INODE_FREEING, - &delegation->flags) || - test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags) || - test_bit(NFS_DELEGATION_TEST_EXPIRED, - &delegation->flags) == 0) - continue; - if (!nfs_sb_active(server->super)) - break; /* continue in outer loop */ - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - nfs_sb_deactive(server->super); - goto restart; - } - cred = get_cred_rcu(delegation->cred); - nfs4_stateid_copy(&stateid, &delegation->stateid); - clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); - rcu_read_unlock(); - nfs_delegation_test_free_expired(inode, &stateid, cred); - put_cred(cred); - if (nfs4_server_rebooted(clp)) { - nfs_inode_mark_test_expired_delegation(server,inode); - iput(inode); - nfs_sb_deactive(server->super); - return; - } +restart_locked: + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_TEST_EXPIRED, + &delegation->flags) == 0) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + goto restart_locked; + spin_lock(&delegation->lock); + cred = get_cred_rcu(delegation->cred); + nfs4_stateid_copy(&stateid, &delegation->stateid); + spin_unlock(&delegation->lock); + clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + rcu_read_unlock(); + nfs_delegation_test_free_expired(inode, &stateid, cred); + put_cred(cred); + if (!nfs4_server_rebooted(server->nfs_client)) { iput(inode); - nfs_sb_deactive(server->super); cond_resched(); goto restart; } + nfs_inode_mark_test_expired_delegation(server,inode); + iput(inode); + return -EAGAIN; } rcu_read_unlock(); + return 0; +} + +/** + * nfs_reap_expired_delegations - reap expired delegations + * @clp: nfs_client to process + * + * Iterates through all the delegations associated with this server and + * checks if they have may have been revoked. This function is usually + * expected to be called in cases where the server may have lost its + * lease. + */ +void nfs_reap_expired_delegations(struct nfs_client *clp) +{ + nfs_client_for_each_server(clp, nfs_server_reap_expired_delegations, + NULL); } void nfs_inode_find_delegation_state_and_recover(struct inode *inode, @@ -1359,11 +1365,14 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - bool ret; + bool ret = false; flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); + if (!delegation) + goto out; + spin_lock(&delegation->lock); ret = nfs4_is_valid_delegation(delegation, flags); if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); @@ -1371,6 +1380,8 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, if (cred) *cred = get_cred(delegation->cred); } + spin_unlock(&delegation->lock); +out: rcu_read_unlock(); return ret; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d4b839b6cf89..5a331da5f55a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -141,10 +141,9 @@ struct nfs_cache_array { int size; int eof_index; u64 last_cookie; - struct nfs_cache_array_entry array[0]; + struct nfs_cache_array_entry array[]; }; -typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool); typedef struct { struct file *file; struct page *page; @@ -153,7 +152,7 @@ typedef struct { u64 *dir_cookie; u64 last_cookie; loff_t current_index; - decode_dirent_t decode; + loff_t prev_index; unsigned long dir_verifier; unsigned long timestamp; @@ -240,6 +239,25 @@ out: return ret; } +static inline +int is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT + return in_compat_syscall(); +#else + return (BITS_PER_LONG == 32); +#endif +} + +static +bool nfs_readdir_use_cookie(const struct file *filp) +{ + if ((filp->f_mode & FMODE_32BITHASH) || + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) + return false; + return true; +} + static int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) { @@ -289,7 +307,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des !nfs_readdir_inode_mapping_valid(nfsi)) { ctx->duped = 0; ctx->attr_gencount = nfsi->attr_gencount; - } else if (new_pos < desc->ctx->pos) { + } else if (new_pos < desc->prev_index) { if (ctx->duped > 0 && ctx->dup_cookie == *desc->dir_cookie) { if (printk_ratelimit()) { @@ -305,7 +323,11 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des ctx->dup_cookie = *desc->dir_cookie; ctx->duped = -1; } - desc->ctx->pos = new_pos; + if (nfs_readdir_use_cookie(desc->file)) + desc->ctx->pos = *desc->dir_cookie; + else + desc->ctx->pos = new_pos; + desc->prev_index = new_pos; desc->cache_entry_index = i; return 0; } @@ -376,9 +398,10 @@ error: static int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct xdr_stream *xdr) { + struct inode *inode = file_inode(desc->file); int error; - error = desc->decode(xdr, entry, desc->plus); + error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus); if (error) return error; entry->fattr->time_start = desc->timestamp; @@ -756,6 +779,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) if (desc->page_index == 0) { desc->current_index = 0; + desc->prev_index = 0; desc->last_cookie = 0; } do { @@ -786,11 +810,14 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) desc->eof = true; break; } - desc->ctx->pos++; if (i < (array->size-1)) *desc->dir_cookie = array->array[i+1].cookie; else *desc->dir_cookie = array->last_cookie; + if (nfs_readdir_use_cookie(file)) + desc->ctx->pos = *desc->dir_cookie; + else + desc->ctx->pos++; if (ctx->duped != 0) ctx->duped = 1; } @@ -860,9 +887,14 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); - nfs_readdir_descriptor_t my_desc, - *desc = &my_desc; struct nfs_open_dir_context *dir_ctx = file->private_data; + nfs_readdir_descriptor_t my_desc = { + .file = file, + .ctx = ctx, + .dir_cookie = &dir_ctx->dir_cookie, + .plus = nfs_use_readdirplus(inode, ctx), + }, + *desc = &my_desc; int res = 0; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", @@ -875,14 +907,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) * to either find the entry with the appropriate number or * revalidate the cookie. */ - memset(desc, 0, sizeof(*desc)); - - desc->file = file; - desc->ctx = ctx; - desc->dir_cookie = &dir_ctx->dir_cookie; - desc->decode = NFS_PROTO(inode)->decode_dirent; - desc->plus = nfs_use_readdirplus(inode, ctx); - if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) @@ -954,7 +978,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) } if (offset != filp->f_pos) { filp->f_pos = offset; - dir_ctx->dir_cookie = 0; + if (nfs_readdir_use_cookie(filp)) + dir_ctx->dir_cookie = offset; + else + dir_ctx->dir_cookie = 0; dir_ctx->duped = 0; } inode_unlock(inode); @@ -2282,7 +2309,7 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock); static LIST_HEAD(nfs_access_lru_list); static atomic_long_t nfs_access_nr_entries; -static unsigned long nfs_access_max_cachesize = ULONG_MAX; +static unsigned long nfs_access_max_cachesize = 4*1024*1024; module_param(nfs_access_max_cachesize, ulong, 0644); MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length"); @@ -2642,9 +2669,10 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask) status = NFS_PROTO(inode)->access(inode, &cache); if (status != 0) { if (status == -ESTALE) { - nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); + else + nfs_zap_caches(inode); } goto out; } @@ -2732,14 +2760,7 @@ force_lookup: if (!NFS_PROTO(inode)->access) goto out_notsup; - /* Always try fast lookups first */ - rcu_read_lock(); - res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK); - rcu_read_unlock(); - if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) { - /* Fast lookup failed, try the slow way */ - res = nfs_do_access(inode, cred, mask); - } + res = nfs_do_access(inode, cred, mask); out: if (!res && (mask & MAY_EXEC)) res = nfs_execute_ok(inode, mask); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b768a0b42e82..a57e7c72c7f4 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -94,7 +94,7 @@ struct nfs_direct_req { #define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ /* for read */ #define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ - struct nfs_writeverf verf; /* unstable write verifier */ +#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */ }; static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; @@ -151,106 +151,6 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, dreq->count = dreq_len; } -/* - * nfs_direct_select_verf - select the right verifier - * @dreq - direct request possibly spanning multiple servers - * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs - * @commit_idx - commit bucket index for the DS - * - * returns the correct verifier to use given the role of the server - */ -static struct nfs_writeverf * -nfs_direct_select_verf(struct nfs_direct_req *dreq, - struct nfs_client *ds_clp, - int commit_idx) -{ - struct nfs_writeverf *verfp = &dreq->verf; - -#ifdef CONFIG_NFS_V4_1 - /* - * pNFS is in use, use the DS verf except commit_through_mds is set - * for layout segment where nbuckets is zero. - */ - if (ds_clp && dreq->ds_cinfo.nbuckets > 0) { - if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets) - verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf; - else - WARN_ON_ONCE(1); - } -#endif - return verfp; -} - - -/* - * nfs_direct_set_hdr_verf - set the write/commit verifier - * @dreq - direct request possibly spanning multiple servers - * @hdr - pageio header to validate against previously seen verfs - * - * Set the server's (MDS or DS) "seen" verifier - */ -static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, - struct nfs_pgio_header *hdr) -{ - struct nfs_writeverf *verfp; - - verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx); - WARN_ON_ONCE(verfp->committed >= 0); - memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); - WARN_ON_ONCE(verfp->committed < 0); -} - -static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1, - const struct nfs_writeverf *v2) -{ - return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier); -} - -/* - * nfs_direct_cmp_hdr_verf - compare verifier for pgio header - * @dreq - direct request possibly spanning multiple servers - * @hdr - pageio header to validate against previously seen verf - * - * set the server's "seen" verf if not initialized. - * returns result of comparison between @hdr->verf and the "seen" - * verf of the server used by @hdr (DS or MDS) - */ -static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, - struct nfs_pgio_header *hdr) -{ - struct nfs_writeverf *verfp; - - verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx); - if (verfp->committed < 0) { - nfs_direct_set_hdr_verf(dreq, hdr); - return 0; - } - return nfs_direct_cmp_verf(verfp, &hdr->verf); -} - -/* - * nfs_direct_cmp_commit_data_verf - compare verifier for commit data - * @dreq - direct request possibly spanning multiple servers - * @data - commit data to validate against previously seen verf - * - * returns result of comparison between @data->verf and the verf of - * the server used by @data (DS or MDS) - */ -static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, - struct nfs_commit_data *data) -{ - struct nfs_writeverf *verfp; - - verfp = nfs_direct_select_verf(dreq, data->ds_clp, - data->ds_commit_index); - - /* verifier not set so always fail */ - if (verfp->committed < 0 || data->res.verf->committed <= NFS_UNSTABLE) - return 1; - - return nfs_direct_cmp_verf(verfp, data->res.verf); -} - /** * nfs_direct_IO - NFS address space operation for direct I/O * @iocb: target I/O control block @@ -305,7 +205,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) kref_get(&dreq->kref); init_completion(&dreq->completion); INIT_LIST_HEAD(&dreq->mds_cinfo.list); - dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ + pnfs_init_ds_commit_info(&dreq->ds_cinfo); INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); spin_lock_init(&dreq->lock); @@ -316,7 +216,7 @@ static void nfs_direct_req_free(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); - nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo); + pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode); if (dreq->l_ctx != NULL) nfs_put_lock_context(dreq->l_ctx); if (dreq->ctx != NULL) @@ -571,6 +471,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { result = PTR_ERR(l_ctx); + nfs_direct_req_release(dreq); goto out_release; } dreq->l_ctx = l_ctx; @@ -605,15 +506,30 @@ out: } static void +nfs_direct_join_group(struct list_head *list, struct inode *inode) +{ + struct nfs_page *req, *next; + + list_for_each_entry(req, list, wb_list) { + if (req->wb_head != req || req->wb_this_page == req) + continue; + for (next = req->wb_this_page; + next != req->wb_head; + next = next->wb_this_page) { + nfs_list_remove_request(next); + nfs_release_request(next); + } + nfs_join_page_group(req, inode); + } +} + +static void nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); -#ifdef CONFIG_NFS_V4_1 - if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) - NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); -#endif + pnfs_recover_commit_reqs(list, cinfo); nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } @@ -629,11 +545,12 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); + nfs_direct_join_group(&reqs, dreq->inode); + dreq->count = 0; dreq->max_count = 0; list_for_each_entry(req, &reqs, wb_list) dreq->max_count += req->wb_bytes; - dreq->verf.committed = NFS_INVALID_STABLE_HOW; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); get_dreq(dreq); @@ -670,27 +587,35 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) static void nfs_direct_commit_complete(struct nfs_commit_data *data) { + const struct nfs_writeverf *verf = data->res.verf; struct nfs_direct_req *dreq = data->dreq; struct nfs_commit_info cinfo; struct nfs_page *req; int status = data->task.tk_status; + if (status < 0) { + /* Errors in commit are fatal */ + dreq->error = status; + dreq->max_count = 0; + dreq->count = 0; + dreq->flags = NFS_ODIRECT_DONE; + } else if (dreq->flags == NFS_ODIRECT_DONE) + status = dreq->error; + nfs_init_cinfo_from_dreq(&cinfo, dreq); - if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data)) - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { + if (status >= 0 && !nfs_write_match_verf(verf, req)) { + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; /* * Despite the reboot, the write was successful, * so reset wb_nio. */ req->wb_nio = 0; - /* Note the rewrite will go through mds */ nfs_mark_request_commit(req, NULL, &cinfo, 0); - } else + } else /* Error or match */ nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -705,7 +630,8 @@ static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq = cinfo->dreq; spin_lock(&dreq->lock); - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + if (dreq->flags != NFS_ODIRECT_DONE) + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; spin_unlock(&dreq->lock); nfs_mark_request_commit(req, NULL, cinfo, 0); } @@ -728,6 +654,23 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) nfs_direct_write_reschedule(dreq); } +static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) +{ + struct nfs_commit_info cinfo; + struct nfs_page *req; + LIST_HEAD(reqs); + + nfs_init_cinfo_from_dreq(&cinfo, dreq); + nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); + + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); + nfs_list_remove_request(req); + nfs_release_request(req); + nfs_unlock_and_release_request(req); + } +} + static void nfs_direct_write_schedule_work(struct work_struct *work) { struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work); @@ -742,6 +685,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: + nfs_direct_write_clear_reqs(dreq); nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping); nfs_direct_complete(dreq); } @@ -768,20 +712,15 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } nfs_direct_count_bytes(dreq, hdr); - if (hdr->good_bytes != 0) { - if (nfs_write_need_commit(hdr)) { - if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) - request_commit = true; - else if (dreq->flags == 0) { - nfs_direct_set_hdr_verf(dreq, hdr); - request_commit = true; - dreq->flags = NFS_ODIRECT_DO_COMMIT; - } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - request_commit = true; - if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) - dreq->flags = - NFS_ODIRECT_RESCHED_WRITES; - } + if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) { + switch (dreq->flags) { + case 0: + dreq->flags = NFS_ODIRECT_DO_COMMIT; + request_commit = true; + break; + case NFS_ODIRECT_RESCHED_WRITES: + case NFS_ODIRECT_DO_COMMIT: + request_commit = true; } } spin_unlock(&dreq->lock); @@ -990,11 +929,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { result = PTR_ERR(l_ctx); + nfs_direct_req_release(dreq); goto out_release; } dreq->l_ctx = l_ctx; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; + pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode); nfs_start_io_direct(inode); diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c index 89bd5581f317..963800037609 100644 --- a/fs/nfs/dns_resolve.c +++ b/fs/nfs/dns_resolve.c @@ -152,12 +152,13 @@ static int nfs_dns_upcall(struct cache_detail *cd, struct cache_head *ch) { struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); - int ret; - ret = nfs_cache_upcall(cd, key->hostname); - if (ret) - ret = sunrpc_cache_pipe_upcall(cd, ch); - return ret; + if (test_and_set_bit(CACHE_PENDING, &ch->flags)) + return 0; + if (!nfs_cache_upcall(cd, key->hostname)) + return 0; + clear_bit(CACHE_PENDING, &ch->flags); + return sunrpc_cache_pipe_upcall_timeout(cd, ch); } static int nfs_dns_match(struct cache_head *ca, diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index c9b605f6c9cb..a13e69009f19 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -49,6 +49,7 @@ MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); MODULE_DESCRIPTION("The NFSv4 file layout driver"); #define FILELAYOUT_POLL_RETRY_MAX (15*HZ) +static const struct pnfs_commit_ops filelayout_commit_ops; static loff_t filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, @@ -750,72 +751,17 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) /* This assumes a single RW lseg */ if (lseg->pls_range.iomode == IOMODE_RW) { struct nfs4_filelayout *flo; + struct inode *inode; flo = FILELAYOUT_FROM_HDR(lseg->pls_layout); - flo->commit_info.nbuckets = 0; - kfree(flo->commit_info.buckets); - flo->commit_info.buckets = NULL; + inode = flo->generic_hdr.plh_inode; + spin_lock(&inode->i_lock); + pnfs_generic_ds_cinfo_release_lseg(&flo->commit_info, lseg); + spin_unlock(&inode->i_lock); } _filelayout_free_lseg(fl); } -static int -filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - gfp_t gfp_flags) -{ - struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); - struct pnfs_commit_bucket *buckets; - int size, i; - - if (fl->commit_through_mds) - return 0; - - size = (fl->stripe_type == STRIPE_SPARSE) ? - fl->dsaddr->ds_num : fl->dsaddr->stripe_count; - - if (cinfo->ds->nbuckets >= size) { - /* This assumes there is only one IOMODE_RW lseg. What - * we really want to do is have a layout_hdr level - * dictionary of <multipath_list4, fh> keys, each - * associated with a struct list_head, populated by calls - * to filelayout_write_pagelist(). - * */ - return 0; - } - - buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), - gfp_flags); - if (!buckets) - return -ENOMEM; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&buckets[i].written); - INIT_LIST_HEAD(&buckets[i].committing); - /* mark direct verifier as unset */ - buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; - } - - spin_lock(&cinfo->inode->i_lock); - if (cinfo->ds->nbuckets >= size) - goto out; - for (i = 0; i < cinfo->ds->nbuckets; i++) { - list_splice(&cinfo->ds->buckets[i].written, - &buckets[i].written); - list_splice(&cinfo->ds->buckets[i].committing, - &buckets[i].committing); - buckets[i].direct_verf.committed = - cinfo->ds->buckets[i].direct_verf.committed; - buckets[i].wlseg = cinfo->ds->buckets[i].wlseg; - buckets[i].clseg = cinfo->ds->buckets[i].clseg; - } - swap(cinfo->ds->buckets, buckets); - cinfo->ds->nbuckets = size; -out: - spin_unlock(&cinfo->inode->i_lock); - kfree(buckets); - return 0; -} - static struct pnfs_layout_segment * filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, @@ -938,9 +884,6 @@ static void filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - struct nfs_commit_info cinfo; - int status; - pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, @@ -959,17 +902,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) - goto out_mds; - nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); - status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); - if (status < 0) { - pnfs_put_lseg(pgio->pg_lseg); - pgio->pg_lseg = NULL; - goto out_mds; - } - return; -out_mds: - nfs_pageio_reset_write_mds(pgio); + nfs_pageio_reset_write_mds(pgio); } static const struct nfs_pageio_ops filelayout_pg_read_ops = { @@ -1078,36 +1011,6 @@ out_err: return -EAGAIN; } -/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest - * for @page - * @cinfo - commit info for current inode - * @page - page to search for matching head request - * - * Returns a the head request if one is found, otherwise returns NULL. - */ -static struct nfs_page * -filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) -{ - struct nfs_page *freq, *t; - struct pnfs_commit_bucket *b; - int i; - - /* Linearly search the commit lists for each bucket until a matching - * request is found */ - for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { - list_for_each_entry_safe(freq, t, &b->written, wb_list) { - if (freq->wb_page == page) - return freq->wb_head; - } - list_for_each_entry_safe(freq, t, &b->committing, wb_list) { - if (freq->wb_page == page) - return freq->wb_head; - } - } - - return NULL; -} - static int filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) @@ -1140,13 +1043,17 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) struct nfs4_filelayout *flo; flo = kzalloc(sizeof(*flo), gfp_flags); - return flo != NULL ? &flo->generic_hdr : NULL; + if (flo == NULL) + return NULL; + pnfs_init_ds_commit_info(&flo->commit_info); + flo->commit_info.ops = &filelayout_commit_ops; + return &flo->generic_hdr; } static void filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo) { - kfree(FILELAYOUT_FROM_HDR(lo)); + kfree_rcu(FILELAYOUT_FROM_HDR(lo), generic_hdr.plh_rcu); } static struct pnfs_ds_commit_info * @@ -1160,6 +1067,46 @@ filelayout_get_ds_info(struct inode *inode) return &FILELAYOUT_FROM_HDR(layout)->commit_info; } +static void +filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); + struct inode *inode = lseg->pls_layout->plh_inode; + struct pnfs_commit_array *array, *new; + unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ? + fl->dsaddr->ds_num : fl->dsaddr->stripe_count; + + new = pnfs_alloc_commit_array(size, GFP_NOIO); + if (new) { + spin_lock(&inode->i_lock); + array = pnfs_add_commit_array(fl_cinfo, new, lseg); + spin_unlock(&inode->i_lock); + if (array != new) + pnfs_free_commit_array(new); + } +} + +static void +filelayout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + pnfs_generic_ds_cinfo_destroy(fl_cinfo); + spin_unlock(&inode->i_lock); +} + +static const struct pnfs_commit_ops filelayout_commit_ops = { + .setup_ds_info = filelayout_setup_ds_info, + .release_ds_info = filelayout_release_ds_info, + .mark_request_commit = filelayout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .search_commit_reqs = pnfs_generic_search_commit_reqs, + .commit_pagelist = filelayout_commit_pagelist, +}; + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", @@ -1173,12 +1120,6 @@ static struct pnfs_layoutdriver_type filelayout_type = { .pg_read_ops = &filelayout_pg_read_ops, .pg_write_ops = &filelayout_pg_write_ops, .get_ds_info = &filelayout_get_ds_info, - .mark_request_commit = filelayout_mark_request_commit, - .clear_request_commit = pnfs_generic_clear_request_commit, - .scan_commit_lists = pnfs_generic_scan_commit_lists, - .recover_commit_reqs = pnfs_generic_recover_commit_reqs, - .search_commit_reqs = filelayout_search_commit_reqs, - .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, .alloc_deviceid_node = filelayout_alloc_deviceid_node, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index bb9148b83166..7d399f72ebbb 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -32,6 +32,7 @@ static unsigned short io_maxretrans; +static const struct pnfs_commit_ops ff_layout_commit_ops; static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr); static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, @@ -48,9 +49,11 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) ffl = kzalloc(sizeof(*ffl), gfp_flags); if (ffl) { + pnfs_init_ds_commit_info(&ffl->commit_info); INIT_LIST_HEAD(&ffl->error_list); INIT_LIST_HEAD(&ffl->mirrors); ffl->last_report_time = ktime_get(); + ffl->commit_info.ops = &ff_layout_commit_ops; return &ffl->generic_hdr; } else return NULL; @@ -59,14 +62,14 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) static void ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) { + struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_ds_err *err, *n; - list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list, - list) { + list_for_each_entry_safe(err, n, &ffl->error_list, list) { list_del(&err->list); kfree(err); } - kfree(FF_LAYOUT_FROM_HDR(lo)); + kfree_rcu(ffl, generic_hdr.plh_rcu); } static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) @@ -248,36 +251,10 @@ static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror) static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) { - int i; - - if (fls->mirror_array) { - for (i = 0; i < fls->mirror_array_cnt; i++) { - /* normally mirror_ds is freed in - * .free_deviceid_node but we still do it here - * for .alloc_lseg error path */ - ff_layout_put_mirror(fls->mirror_array[i]); - } - kfree(fls->mirror_array); - fls->mirror_array = NULL; - } -} - -static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr) -{ - int ret = 0; + u32 i; - dprintk("--> %s\n", __func__); - - /* FIXME: remove this check when layout segment support is added */ - if (lgr->range.offset != 0 || - lgr->range.length != NFS4_MAX_UINT64) { - dprintk("%s Only whole file layouts supported. Use MDS i/o\n", - __func__); - ret = -EINVAL; - } - - dprintk("--> %s returns %d\n", __func__, ret); - return ret; + for (i = 0; i < fls->mirror_array_cnt; i++) + ff_layout_put_mirror(fls->mirror_array[i]); } static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) @@ -289,6 +266,23 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) } static bool +ff_lseg_match_mirrors(struct pnfs_layout_segment *l1, + struct pnfs_layout_segment *l2) +{ + const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1); + const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l1); + u32 i; + + if (fl1->mirror_array_cnt != fl2->mirror_array_cnt) + return false; + for (i = 0; i < fl1->mirror_array_cnt; i++) { + if (fl1->mirror_array[i] != fl2->mirror_array[i]) + return false; + } + return true; +} + +static bool ff_lseg_range_is_after(const struct pnfs_layout_range *l1, const struct pnfs_layout_range *l2) { @@ -323,6 +317,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new, new->pls_range.length); if (new_end < old->pls_range.offset) return false; + if (!ff_lseg_match_mirrors(new, old)) + return false; /* Mergeable: copy info from 'old' to 'new' */ if (new_end < old_end) @@ -400,16 +396,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; rc = -ENOMEM; - fls = kzalloc(sizeof(*fls), gfp_flags); + fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt), + gfp_flags); if (!fls) goto out_err_free; fls->mirror_array_cnt = mirror_array_cnt; fls->stripe_unit = stripe_unit; - fls->mirror_array = kcalloc(fls->mirror_array_cnt, - sizeof(fls->mirror_array[0]), gfp_flags); - if (fls->mirror_array == NULL) - goto out_err_free; for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; @@ -545,9 +538,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, out_sort_mirrors: ff_layout_sort_mirrors(fls); - rc = ff_layout_check_layout(lgr); - if (rc) - goto out_err_free; ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); out_free_page: @@ -560,17 +550,6 @@ out_err_free: goto out_free_page; } -static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout) -{ - struct pnfs_layout_segment *lseg; - - list_for_each_entry(lseg, &layout->plh_segs, pls_list) - if (lseg->pls_range.iomode == IOMODE_RW) - return true; - - return false; -} - static void ff_layout_free_lseg(struct pnfs_layout_segment *lseg) { @@ -585,23 +564,12 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg) ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout); inode = ffl->generic_hdr.plh_inode; spin_lock(&inode->i_lock); - if (!ff_layout_has_rw_segments(lseg->pls_layout)) { - ffl->commit_info.nbuckets = 0; - kfree(ffl->commit_info.buckets); - ffl->commit_info.buckets = NULL; - } + pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg); spin_unlock(&inode->i_lock); } _ff_layout_free_lseg(fls); } -/* Return 1 until we have multiple lsegs support */ -static int -ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) -{ - return 1; -} - static void nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { @@ -746,52 +714,6 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, spin_unlock(&mirror->lock); } -static int -ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - gfp_t gfp_flags) -{ - struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); - struct pnfs_commit_bucket *buckets; - int size; - - if (cinfo->ds->nbuckets != 0) { - /* This assumes there is only one RW lseg per file. - * To support multiple lseg per file, we need to - * change struct pnfs_commit_bucket to allow dynamic - * increasing nbuckets. - */ - return 0; - } - - size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg); - - buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), - gfp_flags); - if (!buckets) - return -ENOMEM; - else { - int i; - - spin_lock(&cinfo->inode->i_lock); - if (cinfo->ds->nbuckets != 0) - kfree(buckets); - else { - cinfo->ds->buckets = buckets; - cinfo->ds->nbuckets = size; - for (i = 0; i < size; i++) { - INIT_LIST_HEAD(&buckets[i].written); - INIT_LIST_HEAD(&buckets[i].committing); - /* mark direct verifier as unset */ - buckets[i].direct_verf.committed = - NFS_INVALID_STABLE_HOW; - } - } - spin_unlock(&cinfo->inode->i_lock); - return 0; - } -} - static void ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) { @@ -876,8 +798,8 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), - 0, - NFS4_MAX_UINT64, + req_offset(req), + req->wb_bytes, IOMODE_READ, strict_iomode, GFP_KERNEL); @@ -888,6 +810,14 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, } static void +ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + pnfs_generic_pg_check_layout(pgio); + pnfs_generic_pg_check_range(pgio, req); +} + +static void ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { @@ -897,7 +827,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; retry: - pnfs_generic_pg_check_layout(pgio); + ff_layout_pg_check_layout(pgio, req); /* Use full layout for now */ if (!pgio->pg_lseg) { ff_layout_pg_get_read(pgio, req, false); @@ -953,18 +883,16 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, { struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; - struct nfs_commit_info cinfo; struct nfs4_pnfs_ds *ds; int i; - int status; retry: - pnfs_generic_pg_check_layout(pgio); + ff_layout_pg_check_layout(pgio, req); if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req), - 0, - NFS4_MAX_UINT64, + req_offset(req), + req->wb_bytes, IOMODE_RW, false, GFP_NOFS); @@ -978,11 +906,6 @@ retry: if (pgio->pg_lseg == NULL) goto out_mds; - nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); - status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); - if (status < 0) - goto out_mds; - /* Use a direct mapping of ds_idx to pgio mirror_idx */ if (WARN_ON_ONCE(pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))) @@ -1297,21 +1220,23 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, } } + mirror = FF_LAYOUT_COMP(lseg, idx); + err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, offset, length, status, opnum, + GFP_NOIO); + switch (status) { case NFS4ERR_DELAY: case NFS4ERR_GRACE: - return; - default: break; + case NFS4ERR_NXIO: + ff_layout_mark_ds_unreachable(lseg, idx); + /* Fallthrough */ + default: + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, + lseg); } - mirror = FF_LAYOUT_COMP(lseg, idx); - err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, offset, length, status, opnum, - GFP_NOIO); - if (status == NFS4ERR_NXIO) - ff_layout_mark_ds_unreachable(lseg, idx); - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); } @@ -2012,6 +1937,33 @@ ff_layout_get_ds_info(struct inode *inode) } static void +ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + struct inode *inode = lseg->pls_layout->plh_inode; + struct pnfs_commit_array *array, *new; + + new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_NOIO); + if (new) { + spin_lock(&inode->i_lock); + array = pnfs_add_commit_array(fl_cinfo, new, lseg); + spin_unlock(&inode->i_lock); + if (array != new) + pnfs_free_commit_array(new); + } +} + +static void +ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + pnfs_generic_ds_cinfo_destroy(fl_cinfo); + spin_unlock(&inode->i_lock); +} + +static void ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, @@ -2496,6 +2448,16 @@ ff_layout_set_layoutdriver(struct nfs_server *server, return 0; } +static const struct pnfs_commit_ops ff_layout_commit_ops = { + .setup_ds_info = ff_layout_setup_ds_info, + .release_ds_info = ff_layout_release_ds_info, + .mark_request_commit = pnfs_layout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .commit_pagelist = ff_layout_commit_pagelist, +}; + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", @@ -2512,11 +2474,6 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, .free_deviceid_node = ff_layout_free_deviceid_node, - .mark_request_commit = pnfs_layout_mark_request_commit, - .clear_request_commit = pnfs_generic_clear_request_commit, - .scan_commit_lists = pnfs_generic_scan_commit_lists, - .recover_commit_reqs = pnfs_generic_recover_commit_reqs, - .commit_pagelist = ff_layout_commit_pagelist, .read_pagelist = ff_layout_read_pagelist, .write_pagelist = ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 2f369966abf7..354a031c69b1 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -99,7 +99,7 @@ struct nfs4_ff_layout_segment { u64 stripe_unit; u32 flags; u32 mirror_array_cnt; - struct nfs4_ff_layout_mirror **mirror_array; + struct nfs4_ff_layout_mirror *mirror_array[]; }; struct nfs4_flexfile_layout { diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index e113fcb4bb4c..ccc88be88d6a 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -190,6 +190,7 @@ static const struct constant_table nfs_vers_tokens[] = { { "4.0", Opt_vers_4_0 }, { "4.1", Opt_vers_4_1 }, { "4.2", Opt_vers_4_2 }, + {} }; enum { @@ -202,13 +203,14 @@ enum { nr__Opt_xprt }; -static const struct constant_table nfs_xprt_protocol_tokens[nr__Opt_xprt] = { +static const struct constant_table nfs_xprt_protocol_tokens[] = { { "rdma", Opt_xprt_rdma }, { "rdma6", Opt_xprt_rdma6 }, { "tcp", Opt_xprt_tcp }, { "tcp6", Opt_xprt_tcp6 }, { "udp", Opt_xprt_udp }, { "udp6", Opt_xprt_udp6 }, + {} }; enum { @@ -239,6 +241,7 @@ static const struct constant_table nfs_secflavor_tokens[] = { { "spkm3i", Opt_sec_spkmi }, { "spkm3p", Opt_sec_spkmp }, { "sys", Opt_sec_sys }, + {} }; /* @@ -1135,7 +1138,7 @@ out_no_address: return nfs_invalf(fc, "NFS4: mount program didn't pass remote address"); out_invalid_transport_udp: - return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp"); + return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); } #endif @@ -1257,7 +1260,7 @@ out_v4_not_compiled: nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel"); return -EPROTONOSUPPORT; out_invalid_transport_udp: - return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp"); + return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); out_no_address: return nfs_invalf(fc, "NFS: mount program didn't pass remote address"); out_mountproto_mismatch: diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index b012c2668a1f..aaeeb4659bff 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -73,6 +73,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) struct inode *inode; char *name; int error = -ENOMEM; + unsigned long kflags = 0, kflags_out = 0; name = kstrdup(fc->source, GFP_KERNEL); if (!name) @@ -83,11 +84,14 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) if (fsinfo.fattr == NULL) goto out_name; + fsinfo.fattr->label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(fsinfo.fattr->label)) + goto out_fattr; error = server->nfs_client->rpc_ops->getroot(server, ctx->mntfh, &fsinfo); if (error < 0) { dprintk("nfs_get_root: getattr error = %d\n", -error); nfs_errorf(fc, "NFS: Couldn't getattr on root"); - goto out_fattr; + goto out_label; } inode = nfs_fhget(s, ctx->mntfh, fsinfo.fattr, NULL); @@ -95,12 +99,12 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) dprintk("nfs_get_root: get root inode failed\n"); error = PTR_ERR(inode); nfs_errorf(fc, "NFS: Couldn't get root inode"); - goto out_fattr; + goto out_label; } error = nfs_superblock_set_dummy_root(s, inode); if (error != 0) - goto out_fattr; + goto out_label; /* root dentries normally start off anonymous and get spliced in later * if the dentry tree reaches them; however if the dentry already @@ -111,7 +115,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) dprintk("nfs_get_root: get root dentry failed\n"); error = PTR_ERR(root); nfs_errorf(fc, "NFS: Couldn't get root dentry"); - goto out_fattr; + goto out_label; } security_d_instantiate(root, inode); @@ -123,12 +127,39 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) } spin_unlock(&root->d_lock); fc->root = root; + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + kflags |= SECURITY_LSM_NATIVE_LABELS; + if (ctx->clone_data.sb) { + if (d_inode(fc->root)->i_fop != &nfs_dir_operations) { + error = -ESTALE; + goto error_splat_root; + } + /* clone lsm security options from the parent to the new sb */ + error = security_sb_clone_mnt_opts(ctx->clone_data.sb, + s, kflags, &kflags_out); + } else { + error = security_sb_set_mnt_opts(s, fc->security, + kflags, &kflags_out); + } + if (error) + goto error_splat_root; + if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && + !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) + NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; + + nfs_setsecurity(inode, fsinfo.fattr, fsinfo.fattr->label); error = 0; +out_label: + nfs4_label_free(fsinfo.fattr->label); out_fattr: nfs_free_fattr(fsinfo.fattr); out_name: kfree(name); out: return error; +error_splat_root: + dput(fc->root); + fc->root = NULL; + goto out_label; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 11bf15800ac9..b9d0921cb4fe 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -62,7 +62,6 @@ /* Default is to see 64-bit inode numbers */ static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; -static void nfs_invalidate_inode(struct inode *); static int nfs_update_inode(struct inode *, struct nfs_fattr *); static struct kmem_cache * nfs_inode_cachep; @@ -284,10 +283,18 @@ EXPORT_SYMBOL_GPL(nfs_invalidate_atime); * Invalidate, but do not unhash, the inode. * NB: must be called with inode->i_lock held! */ -static void nfs_invalidate_inode(struct inode *inode) +static void nfs_set_inode_stale_locked(struct inode *inode) { set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); nfs_zap_caches_locked(inode); + trace_nfs_set_inode_stale(inode); +} + +void nfs_set_inode_stale(struct inode *inode) +{ + spin_lock(&inode->i_lock); + nfs_set_inode_stale_locked(inode); + spin_unlock(&inode->i_lock); } struct nfs_find_desc { @@ -959,16 +966,16 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct file *filp) { struct nfs_open_context *ctx; - const struct cred *cred = get_current_cred(); ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) { - put_cred(cred); + if (!ctx) return ERR_PTR(-ENOMEM); - } nfs_sb_active(dentry->d_sb); ctx->dentry = dget(dentry); - ctx->cred = cred; + if (filp) + ctx->cred = get_cred(filp->f_cred); + else + ctx->cred = get_current_cred(); ctx->ll_cred = NULL; ctx->state = NULL; ctx->mode = f_mode; @@ -1163,9 +1170,10 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = 0; break; case -ESTALE: - nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); + else + nfs_zap_caches(inode); } goto err_out; } @@ -2064,7 +2072,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) * lookup validation will know that the inode is bad. * (But we fall through to invalidate the caches.) */ - nfs_invalidate_inode(inode); + nfs_set_inode_stale_locked(inode); return -ESTALE; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f80c47d5ff27..1f32a9fbfdaf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -274,12 +274,6 @@ void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); -static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc) -{ - WARN_ON_ONCE(desc->pg_mirror_count < 1); - return desc->pg_mirror_count > 1; -} - static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { @@ -417,7 +411,9 @@ extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); - +extern int nfs_client_for_each_server(struct nfs_client *clp, + int (*fn)(struct nfs_server *, void *), + void *data); /* io.c */ extern void nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); @@ -515,13 +511,25 @@ int nfs_filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); #ifdef CONFIG_NFS_V4_1 +static inline void +pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets, + unsigned int nbuckets) +{ + unsigned int i; + + for (i = 0; i < nbuckets; i++) + buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; +} static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { - int i; + struct pnfs_commit_array *array; - for (i = 0; i < cinfo->nbuckets; i++) - cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; + rcu_read_lock(); + list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list) + pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets, + array->nbuckets); + rcu_read_unlock(); } #else static inline @@ -542,6 +550,14 @@ nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, return memcmp(v1->data, v2->data, sizeof(v1->data)); } +static inline bool +nfs_write_match_verf(const struct nfs_writeverf *verf, + struct nfs_page *req) +{ + return verf->committed > NFS_UNSTABLE && + !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); +} + /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index f3ece8ed3203..6b063227e34e 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -145,6 +145,7 @@ struct vfsmount *nfs_d_automount(struct path *path) struct vfsmount *mnt = ERR_PTR(-ENOMEM); struct nfs_server *server = NFS_SERVER(d_inode(path->dentry)); struct nfs_client *client = server->nfs_client; + int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout); int ret; if (IS_ROOT(path->dentry)) @@ -190,12 +191,12 @@ struct vfsmount *nfs_d_automount(struct path *path) if (IS_ERR(mnt)) goto out_fc; - if (nfs_mountpoint_expiry_timeout < 0) + mntget(mnt); /* prevent immediate expiration */ + if (timeout <= 0) goto out_fc; - mntget(mnt); /* prevent immediate expiration */ mnt_set_expiry(mnt, &nfs_automount_list); - schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + schedule_delayed_work(&nfs_automount_task, timeout); out_fc: put_fs_context(fc); @@ -233,10 +234,11 @@ const struct inode_operations nfs_referral_inode_operations = { static void nfs_expire_automounts(struct work_struct *work) { struct list_head *list = &nfs_automount_list; + int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout); mark_mounts_for_expiry(list); - if (!list_empty(list)) - schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); + if (!list_empty(list) && timeout > 0) + schedule_delayed_work(&nfs_automount_task, timeout); } void nfs_release_automount_timer(void) @@ -247,10 +249,7 @@ void nfs_release_automount_timer(void) /** * nfs_do_submount - set up mountpoint when crossing a filesystem boundary - * @dentry: parent directory - * @fh: filehandle for new root dentry - * @fattr: attributes for new root inode - * @authflavor: security flavor to use when performing the mount + * @fc: pointer to struct nfs_fs_context * */ int nfs_do_submount(struct fs_context *fc) @@ -312,3 +311,53 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server) return nfs_do_submount(fc); } EXPORT_SYMBOL_GPL(nfs_submount); + +static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp) +{ + long num; + int ret; + + if (!val) + return -EINVAL; + ret = kstrtol(val, 0, &num); + if (ret) + return -EINVAL; + if (num > 0) { + if (num >= INT_MAX / HZ) + num = INT_MAX; + else + num *= HZ; + *((int *)kp->arg) = num; + if (!list_empty(&nfs_automount_list)) + mod_delayed_work(system_wq, &nfs_automount_task, num); + } else { + *((int *)kp->arg) = -1*HZ; + cancel_delayed_work(&nfs_automount_task); + } + return 0; +} + +static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp) +{ + long num = *((int *)kp->arg); + + if (num > 0) { + if (num >= INT_MAX - (HZ - 1)) + num = INT_MAX / HZ; + else + num = (num + (HZ - 1)) / HZ; + } else + num = -1; + return scnprintf(buffer, PAGE_SIZE, "%li\n", num); +} + +static const struct kernel_param_ops param_ops_nfs_timeout = { + .set = param_set_nfs_timeout, + .get = param_get_nfs_timeout, +}; +#define param_check_nfs_timeout(name, p) __param_check(name, p, int); + +module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644); +MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout, + "Set the NFS automounted mountpoint timeout value (seconds)." + "Values <= 0 turn expiration off."); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 8be1ba7c62bb..2b7f6dcd2eb8 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -42,7 +42,9 @@ enum nfs4_client_state { NFS4CLNT_LEASE_MOVED, NFS4CLNT_DELEGATION_EXPIRED, NFS4CLNT_RUN_MANAGER, - NFS4CLNT_DELEGRETURN_RUNNING, + NFS4CLNT_RECALL_RUNNING, + NFS4CLNT_RECALL_ANY_LAYOUT_READ, + NFS4CLNT_RECALL_ANY_LAYOUT_RW, }; #define NFS4_RENEW_TIMEOUT 0x01 diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1297919e0fce..8e5d6223ddd3 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -252,6 +252,9 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off, if (remap_flags & ~REMAP_FILE_ADVISORY) return -EINVAL; + if (IS_SWAPFILE(dst_inode) || IS_SWAPFILE(src_inode)) + return -ETXTBSY; + /* check alignment w.r.t. clone_blksize */ ret = -EINVAL; if (bs) { diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 84026e7b8a5f..a3ab6e219061 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -354,7 +354,7 @@ static int try_location(struct fs_context *fc, /** * nfs_follow_referral - set up mountpoint when hitting a referral on moved error - * @dentry: parent directory + * @fc: pointer to struct nfs_fs_context * @locations: array of NFSv4 server location information * */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 69b7ab7a5815..512afb1c7867 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2346,7 +2346,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) .callback_ops = &nfs4_open_confirm_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int status; @@ -2511,7 +2511,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, .callback_ops = &nfs4_open_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int status; @@ -2790,16 +2790,19 @@ static int nfs41_check_delegation_stateid(struct nfs4_state *state) return NFS_OK; } + spin_lock(&delegation->lock); nfs4_stateid_copy(&stateid, &delegation->stateid); if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) { + spin_unlock(&delegation->lock); rcu_read_unlock(); return NFS_OK; } if (delegation->cred) cred = get_cred(delegation->cred); + spin_unlock(&delegation->lock); rcu_read_unlock(); status = nfs41_test_and_free_expired_stateid(server, &stateid, cred); trace_nfs4_test_delegation_stateid(state, NULL, status); @@ -3651,7 +3654,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) .rpc_message = &msg, .callback_ops = &nfs4_close_ops, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int status = -ENOMEM; @@ -4002,7 +4005,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, { int error; struct nfs_fattr *fattr = info->fattr; - struct nfs4_label *label = NULL; + struct nfs4_label *label = fattr->label; error = nfs4_server_capabilities(server, mntfh); if (error < 0) { @@ -4010,23 +4013,17 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *mntfh, return error; } - label = nfs4_label_alloc(server, GFP_KERNEL); - if (IS_ERR(label)) - return PTR_ERR(label); - error = nfs4_proc_getattr(server, mntfh, fattr, label, NULL); if (error < 0) { dprintk("nfs4_get_root: getattr error = %d\n", -error); - goto err_free_label; + goto out; } if (fattr->valid & NFS_ATTR_FATTR_FSID && !nfs_fsid_equal(&server->fsid, &fattr->fsid)) memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid)); -err_free_label: - nfs4_label_free(label); - +out: return error; } @@ -5550,7 +5547,7 @@ unwind: struct nfs4_cached_acl { int cached; size_t len; - char data[0]; + char data[]; }; static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl) @@ -6259,6 +6256,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) /* Fallthrough */ case -NFS4ERR_BAD_STATEID: case -NFS4ERR_STALE_STATEID: + case -ETIMEDOUT: task->tk_status = 0; break; case -NFS4ERR_OLD_STATEID: @@ -6349,7 +6347,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | RPC_TASK_TIMEOUT, }; int status = 0; @@ -6932,7 +6930,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .rpc_message = &msg, .callback_ops = &nfs4_lock_ops, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int ret; @@ -9176,7 +9174,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) .rpc_message = &msg, .callback_ops = &nfs4_layoutget_call_ops, .callback_data = lgp, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; struct pnfs_layout_segment *lseg = NULL; struct nfs4_exception exception = { @@ -9293,6 +9291,7 @@ static void nfs4_layoutreturn_release(void *calldata) lrp->ld_private.ops->free(&lrp->ld_private); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); + put_cred(lrp->cred); kfree(calldata); dprintk("<-- %s\n", __func__); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f7723d221945..ac93715c05a4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2524,6 +2524,21 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) } return 0; } + +static void nfs4_layoutreturn_any_run(struct nfs_client *clp) +{ + int iomode = 0; + + if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &clp->cl_state)) + iomode += IOMODE_READ; + if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &clp->cl_state)) + iomode += IOMODE_RW; + /* Note: IOMODE_READ + IOMODE_RW == IOMODE_ANY */ + if (iomode) { + pnfs_layout_return_unused_byclid(clp, iomode); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + } +} #else /* CONFIG_NFS_V4_1 */ static int nfs4_reset_session(struct nfs_client *clp) { return 0; } @@ -2531,6 +2546,10 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp) { return 0; } + +static void nfs4_layoutreturn_any_run(struct nfs_client *clp) +{ +} #endif /* CONFIG_NFS_V4_1 */ static void nfs4_state_manager(struct nfs_client *clp) @@ -2635,12 +2654,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); - if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) { + if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) { if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { nfs_client_return_marked_delegations(clp); set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); } - clear_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state); + nfs4_layoutreturn_any_run(clp); + clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state); } /* Did we race with an attempt to give us more work? */ diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 1e97e5e04cb4..543541173a3d 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -584,7 +584,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_MOVED); TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED); TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED); TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER); -TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING); +TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING); +TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ); +TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW); #define show_nfs4_clp_state(state) \ __print_flags(state, "|", \ @@ -605,7 +607,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING); { NFS4CLNT_LEASE_MOVED, "LEASE_MOVED" }, \ { NFS4CLNT_DELEGATION_EXPIRED, "DELEGATION_EXPIRED" }, \ { NFS4CLNT_RUN_MANAGER, "RUN_MANAGER" }, \ - { NFS4CLNT_DELEGRETURN_RUNNING, "DELEGRETURN_RUNNING" }) + { NFS4CLNT_RECALL_RUNNING, "RECALL_RUNNING" }, \ + { NFS4CLNT_RECALL_ANY_LAYOUT_READ, "RECALL_ANY_LAYOUT_READ" }, \ + { NFS4CLNT_RECALL_ANY_LAYOUT_RW, "RECALL_ANY_LAYOUT_RW" }) TRACE_EVENT(nfs4_state_mgr, TP_PROTO( diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index effaa4247b91..8d3278805602 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -88,7 +88,7 @@ #define NFS_ROOT "/tftpboot/%s" /* Default NFSROOT mount options. */ -#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096" +#define NFS_DEF_OPTIONS "vers=2,tcp,rsize=4096,wsize=4096" /* Parameters passed from the kernel command line */ static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = ""; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index a9588d19a5ae..7e7a97ae21ed 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -181,6 +181,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done, int error \ ), \ TP_ARGS(inode, error)) +DEFINE_NFS_INODE_EVENT(nfs_set_inode_stale); DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit); DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 20b3717cd7ca..f61f96603df7 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -33,9 +33,7 @@ static const struct rpc_call_ops nfs_pgio_common_ops; struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc) { - return nfs_pgio_has_mirroring(desc) ? - &desc->pg_mirrors[desc->pg_mirror_idx] : - &desc->pg_mirrors[0]; + return &desc->pg_mirrors[desc->pg_mirror_idx]; } EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror); @@ -133,47 +131,166 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); /* - * nfs_page_group_lock - lock the head of the page group - * @req - request in group that is to be locked + * nfs_page_lock_head_request - page lock the head of the page group + * @req: any member of the page group + */ +struct nfs_page * +nfs_page_group_lock_head(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + while (!nfs_lock_request(head)) { + int ret = nfs_wait_on_request(head); + if (ret < 0) + return ERR_PTR(ret); + } + if (head != req) + kref_get(&head->wb_kref); + return head; +} + +/* + * nfs_unroll_locks - unlock all newly locked reqs and wait on @req + * @head: head request of page group, must be holding head lock + * @req: request that couldn't lock and needs to wait on the req bit lock * - * this lock must be held when traversing or modifying the page - * group list + * This is a helper function for nfs_lock_and_join_requests + * returns 0 on success, < 0 on error. + */ +static void +nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req) +{ + struct nfs_page *tmp; + + /* relinquish all the locks successfully grabbed this run */ + for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { + if (!kref_read(&tmp->wb_kref)) + continue; + nfs_unlock_and_release_request(tmp); + } +} + +/* + * nfs_page_group_lock_subreq - try to lock a subrequest + * @head: head request of page group + * @subreq: request to lock * - * return 0 on success, < 0 on error + * This is a helper function for nfs_lock_and_join_requests which + * must be called with the head request and page group both locked. + * On error, it returns with the page group unlocked. */ -int -nfs_page_group_lock(struct nfs_page *req) +static int +nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq) { - struct nfs_page *head = req->wb_head; + int ret; + + if (!kref_get_unless_zero(&subreq->wb_kref)) + return 0; + while (!nfs_lock_request(subreq)) { + nfs_page_group_unlock(head); + ret = nfs_wait_on_request(subreq); + if (!ret) + ret = nfs_page_group_lock(head); + if (ret < 0) { + nfs_unroll_locks(head, subreq); + nfs_release_request(subreq); + return ret; + } + } + return 0; +} + +/* + * nfs_page_group_lock_subrequests - try to lock the subrequests + * @head: head request of page group + * + * This is a helper function for nfs_lock_and_join_requests which + * must be called with the head request locked. + */ +int nfs_page_group_lock_subrequests(struct nfs_page *head) +{ + struct nfs_page *subreq; + int ret; - WARN_ON_ONCE(head != head->wb_head); + ret = nfs_page_group_lock(head); + if (ret < 0) + return ret; + /* lock each request in the page group */ + for (subreq = head->wb_this_page; subreq != head; + subreq = subreq->wb_this_page) { + ret = nfs_page_group_lock_subreq(head, subreq); + if (ret < 0) + return ret; + } + nfs_page_group_unlock(head); + return 0; +} - if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) +/* + * nfs_page_set_headlock - set the request PG_HEADLOCK + * @req: request that is to be locked + * + * this lock must be held when modifying req->wb_head + * + * return 0 on success, < 0 on error + */ +int +nfs_page_set_headlock(struct nfs_page *req) +{ + if (!test_and_set_bit(PG_HEADLOCK, &req->wb_flags)) return 0; - set_bit(PG_CONTENDED1, &head->wb_flags); + set_bit(PG_CONTENDED1, &req->wb_flags); smp_mb__after_atomic(); - return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + return wait_on_bit_lock(&req->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); } /* - * nfs_page_group_unlock - unlock the head of the page group - * @req - request in group that is to be unlocked + * nfs_page_clear_headlock - clear the request PG_HEADLOCK + * @req: request that is to be locked */ void -nfs_page_group_unlock(struct nfs_page *req) +nfs_page_clear_headlock(struct nfs_page *req) { - struct nfs_page *head = req->wb_head; - - WARN_ON_ONCE(head != head->wb_head); - smp_mb__before_atomic(); - clear_bit(PG_HEADLOCK, &head->wb_flags); + clear_bit(PG_HEADLOCK, &req->wb_flags); smp_mb__after_atomic(); - if (!test_bit(PG_CONTENDED1, &head->wb_flags)) + if (!test_bit(PG_CONTENDED1, &req->wb_flags)) return; - wake_up_bit(&head->wb_flags, PG_HEADLOCK); + wake_up_bit(&req->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_lock - lock the head of the page group + * @req: request in group that is to be locked + * + * this lock must be held when traversing or modifying the page + * group list + * + * return 0 on success, < 0 on error + */ +int +nfs_page_group_lock(struct nfs_page *req) +{ + int ret; + + ret = nfs_page_set_headlock(req); + if (ret || req->wb_head == req) + return ret; + return nfs_page_set_headlock(req->wb_head); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req: request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ + if (req != req->wb_head) + nfs_page_clear_headlock(req->wb_head); + nfs_page_clear_headlock(req); } /* @@ -359,15 +476,23 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, } static struct nfs_page * -nfs_create_subreq(struct nfs_page *req, struct nfs_page *last, - unsigned int pgbase, unsigned int offset, +nfs_create_subreq(struct nfs_page *req, + unsigned int pgbase, + unsigned int offset, unsigned int count) { + struct nfs_page *last; struct nfs_page *ret; ret = __nfs_create_request(req->wb_lock_context, req->wb_page, pgbase, offset, count); if (!IS_ERR(ret)) { + /* find the last request */ + for (last = req->wb_head; + last->wb_this_page != req->wb_head; + last = last->wb_this_page) + ; + nfs_lock_request(ret); ret->wb_index = req->wb_index; nfs_page_group_init(ret, last); @@ -627,9 +752,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, .callback_ops = call_ops, .callback_data = hdr, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | flags, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags, }; - int ret = 0; hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); @@ -641,18 +765,10 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, (unsigned long long)hdr->args.offset); task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - ret = PTR_ERR(task); - goto out; - } - if (how & FLUSH_SYNC) { - ret = rpc_wait_for_completion_task(task); - if (ret == 0) - ret = task->tk_status; - } + if (IS_ERR(task)) + return PTR_ERR(task); rpc_put_task(task); -out: - return ret; + return 0; } EXPORT_SYMBOL_GPL(nfs_initiate_pgio); @@ -886,15 +1002,6 @@ static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, pgio->pg_mirror_count = mirror_count; } -/* - * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1) - */ -void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio) -{ - pgio->pg_mirror_count = 1; - pgio->pg_mirror_idx = 0; -} - static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) { pgio->pg_mirror_count = 1; @@ -911,7 +1018,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1, } /** - * nfs_can_coalesce_requests - test two requests for compatibility + * nfs_coalesce_size - test two requests for compatibility * @prev: pointer to nfs_page * @req: pointer to nfs_page * @pgio: pointer to nfs_pagio_descriptor @@ -920,41 +1027,36 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1, * page data area they describe is contiguous, and that their RPC * credentials, NFSv4 open state, and lockowners are the same. * - * Return 'true' if this is the case, else return 'false'. + * Returns size of the request that can be coalesced */ -static bool nfs_can_coalesce_requests(struct nfs_page *prev, +static unsigned int nfs_coalesce_size(struct nfs_page *prev, struct nfs_page *req, struct nfs_pageio_descriptor *pgio) { - size_t size; struct file_lock_context *flctx; if (prev) { if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev))) - return false; + return 0; flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx; if (flctx != NULL && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock)) && !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) - return false; + return 0; if (req_offset(req) != req_offset(prev) + prev->wb_bytes) - return false; + return 0; if (req->wb_page == prev->wb_page) { if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes) - return false; + return 0; } else { if (req->wb_pgbase != 0 || prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE) - return false; + return 0; } } - size = pgio->pg_ops->pg_test(pgio, prev, req); - WARN_ON_ONCE(size > req->wb_bytes); - if (size && size < req->wb_bytes) - req->wb_bytes = size; - return size > 0; + return pgio->pg_ops->pg_test(pgio, prev, req); } /** @@ -962,15 +1064,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, * @desc: destination io descriptor * @req: request * - * Returns true if the request 'req' was successfully coalesced into the - * existing list of pages 'desc'. + * If the request 'req' was successfully coalesced into the existing list + * of pages 'desc', it returns the size of req. */ -static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, - struct nfs_page *req) +static unsigned int +nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) { struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_page *prev = NULL; + unsigned int size; if (mirror->pg_count != 0) { prev = nfs_list_entry(mirror->pg_list.prev); @@ -990,11 +1093,12 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, return 0; } - if (!nfs_can_coalesce_requests(prev, req, desc)) - return 0; + size = nfs_coalesce_size(prev, req, desc); + if (size < req->wb_bytes) + return size; nfs_list_move_request(req, &mirror->pg_list); mirror->pg_count += req->wb_bytes; - return 1; + return req->wb_bytes; } /* @@ -1034,7 +1138,8 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc, * @req: request * * This may split a request into subrequests which are all part of the - * same page group. + * same page group. If so, it will submit @req as the last one, to ensure + * the pointer to @req is still valid in case of failure. * * Returns true if the request 'req' was successfully coalesced into the * existing list of pages 'desc'. @@ -1043,51 +1148,50 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_page *subreq; - unsigned int bytes_left = 0; - unsigned int offset, pgbase; + unsigned int size, subreq_size; nfs_page_group_lock(req); subreq = req; - bytes_left = subreq->wb_bytes; - offset = subreq->wb_offset; - pgbase = subreq->wb_pgbase; - - do { - if (!nfs_pageio_do_add_request(desc, subreq)) { - /* make sure pg_test call(s) did nothing */ - WARN_ON_ONCE(subreq->wb_bytes != bytes_left); - WARN_ON_ONCE(subreq->wb_offset != offset); - WARN_ON_ONCE(subreq->wb_pgbase != pgbase); - + subreq_size = subreq->wb_bytes; + for(;;) { + size = nfs_pageio_do_add_request(desc, subreq); + if (size == subreq_size) { + /* We successfully submitted a request */ + if (subreq == req) + break; + req->wb_pgbase += size; + req->wb_bytes -= size; + req->wb_offset += size; + subreq_size = req->wb_bytes; + subreq = req; + continue; + } + if (WARN_ON_ONCE(subreq != req)) { + nfs_page_group_unlock(req); + nfs_pageio_cleanup_request(desc, subreq); + subreq = req; + subreq_size = req->wb_bytes; + nfs_page_group_lock(req); + } + if (!size) { + /* Can't coalesce any more, so do I/O */ nfs_page_group_unlock(req); desc->pg_moreio = 1; nfs_pageio_doio(desc); if (desc->pg_error < 0 || mirror->pg_recoalesce) - goto out_cleanup_subreq; + return 0; /* retry add_request for this subreq */ nfs_page_group_lock(req); continue; } - - /* check for buggy pg_test call(s) */ - WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); - WARN_ON_ONCE(subreq->wb_bytes > bytes_left); - WARN_ON_ONCE(subreq->wb_bytes == 0); - - bytes_left -= subreq->wb_bytes; - offset += subreq->wb_bytes; - pgbase += subreq->wb_bytes; - - if (bytes_left) { - subreq = nfs_create_subreq(req, subreq, pgbase, - offset, bytes_left); - if (IS_ERR(subreq)) - goto err_ptr; - } - } while (bytes_left > 0); + subreq = nfs_create_subreq(req, req->wb_pgbase, + req->wb_offset, size); + if (IS_ERR(subreq)) + goto err_ptr; + subreq_size = size; + } nfs_page_group_unlock(req); return 1; @@ -1095,10 +1199,6 @@ err_ptr: desc->pg_error = PTR_ERR(subreq); nfs_page_group_unlock(req); return 0; -out_cleanup_subreq: - if (req != subreq) - nfs_pageio_cleanup_request(desc, subreq); - return 0; } static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) @@ -1167,7 +1267,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, { u32 midx; unsigned int pgbase, offset, bytes; - struct nfs_page *dupreq, *lastreq; + struct nfs_page *dupreq; pgbase = req->wb_pgbase; offset = req->wb_offset; @@ -1177,38 +1277,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (desc->pg_error < 0) goto out_failed; - for (midx = 0; midx < desc->pg_mirror_count; midx++) { - if (midx) { - nfs_page_group_lock(req); + /* Create the mirror instances first, and fire them off */ + for (midx = 1; midx < desc->pg_mirror_count; midx++) { + nfs_page_group_lock(req); - /* find the last request */ - for (lastreq = req->wb_head; - lastreq->wb_this_page != req->wb_head; - lastreq = lastreq->wb_this_page) - ; + dupreq = nfs_create_subreq(req, + pgbase, offset, bytes); - dupreq = nfs_create_subreq(req, lastreq, - pgbase, offset, bytes); - - nfs_page_group_unlock(req); - if (IS_ERR(dupreq)) { - desc->pg_error = PTR_ERR(dupreq); - goto out_failed; - } - } else - dupreq = req; + nfs_page_group_unlock(req); + if (IS_ERR(dupreq)) { + desc->pg_error = PTR_ERR(dupreq); + goto out_failed; + } - if (nfs_pgio_has_mirroring(desc)) - desc->pg_mirror_idx = midx; + desc->pg_mirror_idx = midx; if (!nfs_pageio_add_request_mirror(desc, dupreq)) goto out_cleanup_subreq; } + desc->pg_mirror_idx = 0; + if (!nfs_pageio_add_request_mirror(desc, req)) + goto out_failed; + return 1; out_cleanup_subreq: - if (req != dupreq) - nfs_pageio_cleanup_request(desc, dupreq); + nfs_pageio_cleanup_request(desc, dupreq); out_failed: nfs_pageio_error_cleanup(desc); return 0; @@ -1226,8 +1320,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx]; u32 restore_idx = desc->pg_mirror_idx; - if (nfs_pgio_has_mirroring(desc)) - desc->pg_mirror_idx = mirror_idx; + desc->pg_mirror_idx = mirror_idx; for (;;) { nfs_pageio_doio(desc); if (desc->pg_error < 0 || !mirror->pg_recoalesce) @@ -1320,6 +1413,14 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) } } +/* + * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1) + */ +void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio) +{ + nfs_pageio_complete(pgio); +} + int __init nfs_init_nfspagecache(void) { nfs_page_cachep = kmem_cache_create("nfs_page", diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 542ea8dfd1bc..f2dc35c22964 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -268,11 +268,11 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo) struct nfs_server *server = NFS_SERVER(lo->plh_inode); struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld; - if (!list_empty(&lo->plh_layouts)) { + if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) { struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - list_del_init(&lo->plh_layouts); + list_del_rcu(&lo->plh_layouts); spin_unlock(&clp->cl_lock); } put_cred(lo->plh_lc_cred); @@ -309,6 +309,16 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) } } +static struct inode * +pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct inode *inode = igrab(lo->plh_inode); + if (inode) + return inode; + set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags); + return NULL; +} + static void pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, u32 seq) @@ -496,6 +506,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); + INIT_LIST_HEAD(&lseg->pls_commits); refcount_set(&lseg->pls_refcount, 1); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; @@ -782,9 +793,10 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, /* If the sb is being destroyed, just bail */ if (!nfs_sb_active(server->super)) break; - inode = igrab(lo->plh_inode); + inode = pnfs_grab_inode_layout_hdr(lo); if (inode != NULL) { - list_del_init(&lo->plh_layouts); + if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) + list_del_rcu(&lo->plh_layouts); if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) continue; @@ -794,7 +806,6 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, } else { rcu_read_unlock(); spin_unlock(&clp->cl_lock); - set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags); } nfs_sb_deactive(server->super); spin_lock(&clp->cl_lock); @@ -903,10 +914,21 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } +static void +pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred) +{ + const struct cred *old; + + if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) { + old = xchg(&lo->plh_lc_cred, get_cred(cred)); + put_cred(old); + } +} + /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, - bool update_barrier) + const struct cred *cred, bool update_barrier) { u32 oldseq, newseq, new_barrier = 0; @@ -914,6 +936,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, newseq = be32_to_cpu(new->seqid); if (!pnfs_layout_is_valid(lo)) { + pnfs_set_layout_cred(lo, cred); nfs4_stateid_copy(&lo->plh_stateid, new); lo->plh_barrier = newseq; pnfs_clear_layoutreturn_info(lo); @@ -1061,7 +1084,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino, lgp->args.ctx = get_nfs_open_context(ctx); nfs4_stateid_copy(&lgp->args.stateid, stateid); lgp->gfp_flags = gfp_flags; - lgp->cred = get_cred(ctx->cred); + lgp->cred = ctx->cred; return lgp; } @@ -1072,7 +1095,6 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp) nfs4_free_pages(lgp->args.layout.pages, max_pages); if (lgp->args.inode) pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout); - put_cred(lgp->cred); put_nfs_open_context(lgp->args.ctx); kfree(lgp); } @@ -1109,7 +1131,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); pnfs_free_returned_lsegs(lo, &freeme, range, seq); - pnfs_set_layout_stateid(lo, stateid, true); + pnfs_set_layout_stateid(lo, stateid, NULL, true); } else pnfs_mark_layout_stateid_invalid(lo, &freeme); out_unlock: @@ -1122,6 +1144,7 @@ out_unlock: static bool pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, + const struct cred **cred, enum pnfs_iomode *iomode) { /* Serialise LAYOUTGET/LAYOUTRETURN */ @@ -1132,18 +1155,17 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { - if (stateid != NULL) { - nfs4_stateid_copy(stateid, &lo->plh_stateid); - if (lo->plh_return_seq != 0) - stateid->seqid = cpu_to_be32(lo->plh_return_seq); - } + nfs4_stateid_copy(stateid, &lo->plh_stateid); + *cred = get_cred(lo->plh_lc_cred); + if (lo->plh_return_seq != 0) + stateid->seqid = cpu_to_be32(lo->plh_return_seq); if (iomode != NULL) *iomode = lo->plh_return_iomode; pnfs_clear_layoutreturn_info(lo); return true; } - if (stateid != NULL) - nfs4_stateid_copy(stateid, &lo->plh_stateid); + nfs4_stateid_copy(stateid, &lo->plh_stateid); + *cred = get_cred(lo->plh_lc_cred); if (iomode != NULL) *iomode = IOMODE_ANY; return true; @@ -1167,20 +1189,26 @@ pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args, } static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, - enum pnfs_iomode iomode, bool sync) +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid, + const struct cred **pcred, + enum pnfs_iomode iomode, + bool sync) { struct inode *ino = lo->plh_inode; struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; struct nfs4_layoutreturn *lrp; + const struct cred *cred = *pcred; int status = 0; + *pcred = NULL; lrp = kzalloc(sizeof(*lrp), GFP_NOFS); if (unlikely(lrp == NULL)) { status = -ENOMEM; spin_lock(&ino->i_lock); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&ino->i_lock); + put_cred(cred); pnfs_put_layout_hdr(lo); goto out; } @@ -1188,7 +1216,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode); lrp->args.ld_private = &lrp->ld_private; lrp->clp = NFS_SERVER(ino)->nfs_client; - lrp->cred = lo->plh_lc_cred; + lrp->cred = cred; if (ld->prepare_layoutreturn) ld->prepare_layoutreturn(&lrp->args); @@ -1233,15 +1261,16 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) return; spin_lock(&inode->i_lock); if (pnfs_layout_need_return(lo)) { + const struct cred *cred; nfs4_stateid stateid; enum pnfs_iomode iomode; bool send; - send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode); spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, &stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); } } else spin_unlock(&inode->i_lock); @@ -1261,6 +1290,7 @@ _pnfs_return_layout(struct inode *ino) struct pnfs_layout_hdr *lo = NULL; struct nfs_inode *nfsi = NFS_I(ino); LIST_HEAD(tmp_list); + const struct cred *cred; nfs4_stateid stateid; int status = 0; bool send, valid_layout; @@ -1305,10 +1335,10 @@ _pnfs_return_layout(struct inode *ino) goto out_put_layout_hdr; } - send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); + send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL); spin_unlock(&ino->i_lock); if (send) - status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); + status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true); out_put_layout_hdr: pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -1354,6 +1384,7 @@ bool pnfs_roc(struct inode *ino, struct nfs4_state *state; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg, *next; + const struct cred *lc_cred; nfs4_stateid stateid; enum pnfs_iomode iomode = 0; bool layoutreturn = false, roc = false; @@ -1423,16 +1454,20 @@ retry: * 2. we don't send layoutreturn */ /* lo ref dropped in pnfs_roc_release() */ - layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode); /* If the creds don't match, we can't compound the layoutreturn */ - if (!layoutreturn || cred_fscmp(cred, lo->plh_lc_cred) != 0) + if (!layoutreturn) goto out_noroc; + if (cred_fscmp(cred, lc_cred) != 0) + goto out_noroc_put_cred; roc = layoutreturn; pnfs_init_layoutreturn_args(args, lo, &stateid, iomode); res->lrs_present = 0; layoutreturn = false; +out_noroc_put_cred: + put_cred(lc_cred); out_noroc: spin_unlock(&ino->i_lock); rcu_read_unlock(); @@ -1445,7 +1480,7 @@ out_noroc: return true; } if (layoutreturn) - pnfs_send_layoutreturn(lo, &stateid, iomode, true); + pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true); pnfs_put_layout_hdr(lo); return false; } @@ -1859,15 +1894,14 @@ static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) static void _add_to_server_list(struct pnfs_layout_hdr *lo, struct nfs_server *server) { - if (list_empty(&lo->plh_layouts)) { + if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) { struct nfs_client *clp = server->nfs_client; /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. */ spin_lock(&clp->cl_lock); - if (list_empty(&lo->plh_layouts)) - list_add_tail(&lo->plh_layouts, &server->layouts); + list_add_tail_rcu(&lo->plh_layouts, &server->layouts); spin_unlock(&clp->cl_lock); } } @@ -2323,14 +2357,14 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) if (!pnfs_layout_is_valid(lo)) { /* We have a completely new layout */ - pnfs_set_layout_stateid(lo, &res->stateid, true); + pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true); } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to sequence\n", __func__); goto out_forget; } - pnfs_set_layout_stateid(lo, &res->stateid, false); + pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false); } else { /* * We got an entirely new state ID. Mark all segments for the @@ -2423,43 +2457,159 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, return -ENOENT; } -void pnfs_error_mark_layout_for_return(struct inode *inode, - struct pnfs_layout_segment *lseg) +static void +pnfs_mark_layout_for_return(struct inode *inode, + const struct pnfs_layout_range *range) { - struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - struct pnfs_layout_range range = { - .iomode = lseg->pls_range.iomode, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; + struct pnfs_layout_hdr *lo; bool return_now = false; spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; if (!pnfs_layout_is_valid(lo)) { spin_unlock(&inode->i_lock); return; } - pnfs_set_plh_return_info(lo, range.iomode, 0); + pnfs_set_plh_return_info(lo, range->iomode, 0); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0) != -EBUSY) { + if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) { + const struct cred *cred; nfs4_stateid stateid; enum pnfs_iomode iomode; - return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode); spin_unlock(&inode->i_lock); if (return_now) - pnfs_send_layoutreturn(lo, &stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); } else { spin_unlock(&inode->i_lock); nfs_commit_inode(inode, 0); } } + +void pnfs_error_mark_layout_for_return(struct inode *inode, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_range range = { + .iomode = lseg->pls_range.iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + pnfs_mark_layout_for_return(inode, &range); +} EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); +static bool +pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo) +{ + return pnfs_layout_is_valid(lo) && + !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) && + !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); +} + +static struct pnfs_layout_segment * +pnfs_find_first_lseg(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + enum pnfs_iomode iomode) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + continue; + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + continue; + if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY) + continue; + if (pnfs_lseg_range_intersecting(&lseg->pls_range, range)) + return lseg; + } + return NULL; +} + +/* Find open file states whose mode matches that of the range */ +static bool +pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range) +{ + struct list_head *head; + struct nfs_open_context *ctx; + fmode_t mode = 0; + + if (!pnfs_layout_can_be_returned(lo) || + !pnfs_find_first_lseg(lo, range, range->iomode)) + return false; + + head = &NFS_I(lo->plh_inode)->open_files; + list_for_each_entry_rcu(ctx, head, list) { + if (ctx->state) + mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE); + } + + switch (range->iomode) { + default: + break; + case IOMODE_READ: + mode &= ~FMODE_WRITE; + break; + case IOMODE_RW: + if (pnfs_find_first_lseg(lo, range, IOMODE_READ)) + mode &= ~FMODE_READ; + } + return mode == 0; +} + +static int +pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data) +{ + const struct pnfs_layout_range *range = data; + struct pnfs_layout_hdr *lo; + struct inode *inode; +restart: + rcu_read_lock(); + list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { + if (!pnfs_layout_can_be_returned(lo) || + test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + continue; + inode = lo->plh_inode; + spin_lock(&inode->i_lock); + if (!pnfs_should_return_unused_layout(lo, range)) { + spin_unlock(&inode->i_lock); + continue; + } + spin_unlock(&inode->i_lock); + inode = pnfs_grab_inode_layout_hdr(lo); + if (!inode) + continue; + rcu_read_unlock(); + pnfs_mark_layout_for_return(inode, range); + iput(inode); + cond_resched(); + goto restart; + } + rcu_read_unlock(); + return 0; +} + +void +pnfs_layout_return_unused_byclid(struct nfs_client *clp, + enum pnfs_iomode iomode) +{ + struct pnfs_layout_range range = { + .iomode = iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver, + &range); +} + void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) { @@ -2475,7 +2625,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); * Check for any intersection between the request and the pgio->pg_lseg, * and if none, put this pgio->pg_lseg away. */ -static void +void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) { @@ -2483,6 +2633,7 @@ pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page pgio->pg_lseg = NULL; } } +EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) @@ -3000,10 +3151,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) end_pos = nfsi->layout->plh_lwb; nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); + data->cred = get_cred(nfsi->layout->plh_lc_cred); spin_unlock(&inode->i_lock); data->args.inode = inode; - data->cred = get_cred(nfsi->layout->plh_lc_cred); nfs_fattr_init(&data->fattr); data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; data->res.fattr = &data->fattr; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 0fafdadc9c8d..8e0ada581b92 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -66,6 +66,7 @@ struct nfs4_pnfs_ds { struct pnfs_layout_segment { struct list_head pls_list; struct list_head pls_lc_list; + struct list_head pls_commits; struct pnfs_layout_range pls_range; refcount_t pls_refcount; u32 pls_seq; @@ -105,6 +106,7 @@ enum { NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */ + NFS_LAYOUT_HASHED, /* The layout visible */ }; enum layoutdriver_policy_flags { @@ -148,22 +150,6 @@ struct pnfs_layoutdriver_type { const struct nfs_pageio_ops *pg_write_ops; struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); - void (*mark_request_commit) (struct nfs_page *req, - struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo, - u32 ds_commit_idx); - void (*clear_request_commit) (struct nfs_page *req, - struct nfs_commit_info *cinfo); - int (*scan_commit_lists) (struct nfs_commit_info *cinfo, - int max); - void (*recover_commit_reqs) (struct list_head *list, - struct nfs_commit_info *cinfo); - struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, - struct page *page); - int (*commit_pagelist)(struct inode *inode, - struct list_head *mds_pages, - int how, - struct nfs_commit_info *cinfo); int (*sync)(struct inode *inode, bool datasync); @@ -186,6 +172,29 @@ struct pnfs_layoutdriver_type { int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); }; +struct pnfs_commit_ops { + void (*setup_ds_info)(struct pnfs_ds_commit_info *, + struct pnfs_layout_segment *); + void (*release_ds_info)(struct pnfs_ds_commit_info *, + struct inode *inode); + int (*commit_pagelist)(struct inode *inode, + struct list_head *mds_pages, + int how, + struct nfs_commit_info *cinfo); + void (*mark_request_commit) (struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); + void (*clear_request_commit) (struct nfs_page *req, + struct nfs_commit_info *cinfo); + int (*scan_commit_lists) (struct nfs_commit_info *cinfo, + int max); + void (*recover_commit_reqs) (struct list_head *list, + struct nfs_commit_info *cinfo); + struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo, + struct page *page); +}; + struct pnfs_layout_hdr { refcount_t plh_refcount; atomic_t plh_outstanding; /* number of RPCs out */ @@ -203,6 +212,7 @@ struct pnfs_layout_hdr { loff_t plh_lwb; /* last write byte for layoutcommit */ const struct cred *plh_lc_cred; /* layoutcommit cred */ struct inode *plh_inode; + struct rcu_head plh_rcu; }; struct pnfs_device { @@ -242,6 +252,7 @@ void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio); +void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, @@ -267,6 +278,7 @@ bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, + const struct cred *cred, bool update_barrier); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, @@ -326,6 +338,9 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); void pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg); +void pnfs_layout_return_unused_byclid(struct nfs_client *clp, + enum pnfs_iomode iomode); + /* nfs4_deviceid_flags */ enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ @@ -360,6 +375,16 @@ bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); /* pnfs_nfs.c */ +struct pnfs_commit_array *pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags); +void pnfs_free_commit_array(struct pnfs_commit_array *p); +struct pnfs_commit_array *pnfs_add_commit_array(struct pnfs_ds_commit_info *, + struct pnfs_commit_array *, + struct pnfs_layout_segment *); + +void pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg); +void pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo); + void pnfs_generic_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo); void pnfs_generic_commit_release(void *calldata); @@ -367,6 +392,8 @@ void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data); void pnfs_generic_rw_release(void *data); void pnfs_generic_recover_commit_reqs(struct list_head *dst, struct nfs_commit_info *cinfo); +struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, + struct page *page); int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int how, @@ -438,9 +465,11 @@ static inline int pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) { - if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0) + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + + if (fl_cinfo == NULL || fl_cinfo->ncommitting == 0) return PNFS_NOT_ATTEMPTED; - return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo); + return fl_cinfo->ops->commit_pagelist(inode, mds_pages, how, cinfo); } static inline struct pnfs_ds_commit_info * @@ -454,6 +483,28 @@ pnfs_get_ds_info(struct inode *inode) } static inline void +pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ + struct pnfs_ds_commit_info *inode_cinfo = pnfs_get_ds_info(inode); + if (inode_cinfo != NULL) + fl_cinfo->ops = inode_cinfo->ops; +} + +static inline void +pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo) +{ + INIT_LIST_HEAD(&fl_cinfo->commits); + fl_cinfo->ops = NULL; +} + +static inline void +pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ + if (fl_cinfo->ops != NULL && fl_cinfo->ops->release_ds_info != NULL) + fl_cinfo->ops->release_ds_info(fl_cinfo, inode); +} + +static inline void pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node) { set_bit(NFS_DEVICEID_INVALID, &node->flags); @@ -463,24 +514,22 @@ static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx) { - struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (lseg == NULL || ld->mark_request_commit == NULL) + if (!lseg || !fl_cinfo->ops->mark_request_commit) return false; - ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx); + fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx); return true; } static inline bool pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) { - struct inode *inode = d_inode(nfs_req_openctx(req)->dentry); - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (ld == NULL || ld->clear_request_commit == NULL) + if (!fl_cinfo || !fl_cinfo->ops || !fl_cinfo->ops->clear_request_commit) return false; - ld->clear_request_commit(req, cinfo); + fl_cinfo->ops->clear_request_commit(req, cinfo); return true; } @@ -488,21 +537,31 @@ static inline int pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, int max) { - if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + + if (!fl_cinfo || fl_cinfo->nwritten == 0) return 0; - else - return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); + return fl_cinfo->ops->scan_commit_lists(cinfo, max); +} + +static inline void +pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) +{ + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + + if (fl_cinfo && fl_cinfo->nwritten != 0) + fl_cinfo->ops->recover_commit_reqs(head, cinfo); } static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, struct page *page) { - struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - if (ld == NULL || ld->search_commit_reqs == NULL) + if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs) return NULL; - return ld->search_commit_reqs(cinfo, page); + return fl_cinfo->ops->search_commit_reqs(cinfo, page); } /* Should the pNFS client commit and return the layout upon a setattr */ @@ -750,6 +809,21 @@ pnfs_get_ds_info(struct inode *inode) return NULL; } +static inline void +pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ +} + +static inline void +pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo) +{ +} + +static inline void +pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode) +{ +} + static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx) @@ -770,6 +844,11 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, return 0; } +static inline void +pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo) +{ +} + static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, struct page *page) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 8b37e7f8e789..e7ddbce48321 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -59,6 +59,17 @@ void pnfs_generic_commit_release(void *calldata) } EXPORT_SYMBOL_GPL(pnfs_generic_commit_release); +static struct pnfs_layout_segment * +pnfs_free_bucket_lseg(struct pnfs_commit_bucket *bucket) +{ + if (list_empty(&bucket->committing) && list_empty(&bucket->written)) { + struct pnfs_layout_segment *freeme = bucket->lseg; + bucket->lseg = NULL; + return freeme; + } + return NULL; +} + /* The generic layer is about to remove the req from the commit list. * If this will make the bucket empty, it will need to put the lseg reference. * Note this must be called holding nfsi->commit_mutex @@ -78,8 +89,7 @@ pnfs_generic_clear_request_commit(struct nfs_page *req, bucket = list_first_entry(&req->wb_list, struct pnfs_commit_bucket, written); - freeme = bucket->wlseg; - bucket->wlseg = NULL; + freeme = pnfs_free_bucket_lseg(bucket); } out: nfs_request_remove_commit_list(req, cinfo); @@ -87,10 +97,154 @@ out: } EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit); +struct pnfs_commit_array * +pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags) +{ + struct pnfs_commit_array *p; + struct pnfs_commit_bucket *b; + + p = kmalloc(struct_size(p, buckets, n), gfp_flags); + if (!p) + return NULL; + p->nbuckets = n; + INIT_LIST_HEAD(&p->cinfo_list); + INIT_LIST_HEAD(&p->lseg_list); + p->lseg = NULL; + for (b = &p->buckets[0]; n != 0; b++, n--) { + INIT_LIST_HEAD(&b->written); + INIT_LIST_HEAD(&b->committing); + b->lseg = NULL; + b->direct_verf.committed = NFS_INVALID_STABLE_HOW; + } + return p; +} +EXPORT_SYMBOL_GPL(pnfs_alloc_commit_array); + +void +pnfs_free_commit_array(struct pnfs_commit_array *p) +{ + kfree_rcu(p, rcu); +} +EXPORT_SYMBOL_GPL(pnfs_free_commit_array); + +static struct pnfs_commit_array * +pnfs_find_commit_array_by_lseg(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array; + + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (array->lseg == lseg) + return array; + } + return NULL; +} + +struct pnfs_commit_array * +pnfs_add_commit_array(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_commit_array *new, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array; + + array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg); + if (array) + return array; + new->lseg = lseg; + refcount_set(&new->refcount, 1); + list_add_rcu(&new->cinfo_list, &fl_cinfo->commits); + list_add(&new->lseg_list, &lseg->pls_commits); + return new; +} +EXPORT_SYMBOL_GPL(pnfs_add_commit_array); + +static struct pnfs_commit_array * +pnfs_lookup_commit_array(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array; + + rcu_read_lock(); + array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg); + if (!array) { + rcu_read_unlock(); + fl_cinfo->ops->setup_ds_info(fl_cinfo, lseg); + rcu_read_lock(); + array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg); + } + rcu_read_unlock(); + return array; +} + +static void +pnfs_release_commit_array_locked(struct pnfs_commit_array *array) +{ + list_del_rcu(&array->cinfo_list); + list_del(&array->lseg_list); + pnfs_free_commit_array(array); +} + +static void +pnfs_put_commit_array_locked(struct pnfs_commit_array *array) +{ + if (refcount_dec_and_test(&array->refcount)) + pnfs_release_commit_array_locked(array); +} + +static void +pnfs_put_commit_array(struct pnfs_commit_array *array, struct inode *inode) +{ + if (refcount_dec_and_lock(&array->refcount, &inode->i_lock)) { + pnfs_release_commit_array_locked(array); + spin_unlock(&inode->i_lock); + } +} + +static struct pnfs_commit_array * +pnfs_get_commit_array(struct pnfs_commit_array *array) +{ + if (refcount_inc_not_zero(&array->refcount)) + return array; + return NULL; +} + +static void +pnfs_remove_and_free_commit_array(struct pnfs_commit_array *array) +{ + array->lseg = NULL; + list_del_init(&array->lseg_list); + pnfs_put_commit_array_locked(array); +} + +void +pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_commit_array *array, *tmp; + + list_for_each_entry_safe(array, tmp, &lseg->pls_commits, lseg_list) + pnfs_remove_and_free_commit_array(array); +} +EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_release_lseg); + +void +pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo) +{ + struct pnfs_commit_array *array, *tmp; + + list_for_each_entry_safe(array, tmp, &fl_cinfo->commits, cinfo_list) + pnfs_remove_and_free_commit_array(array); +} +EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_destroy); + +/* + * Locks the nfs_page requests for commit and moves them to + * @bucket->committing. + */ static int -pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, - struct nfs_commit_info *cinfo, - int max) +pnfs_bucket_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo, + int max) { struct list_head *src = &bucket->written; struct list_head *dst = &bucket->committing; @@ -101,158 +255,254 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, if (ret) { cinfo->ds->nwritten -= ret; cinfo->ds->ncommitting += ret; - if (bucket->clseg == NULL) - bucket->clseg = pnfs_get_lseg(bucket->wlseg); - if (list_empty(src)) { - pnfs_put_lseg(bucket->wlseg); - bucket->wlseg = NULL; - } } return ret; } +static int pnfs_bucket_scan_array(struct nfs_commit_info *cinfo, + struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + int max) +{ + unsigned int i; + int rv = 0, cnt; + + for (i = 0; i < nbuckets && max != 0; i++) { + cnt = pnfs_bucket_scan_ds_commit_list(&buckets[i], cinfo, max); + rv += cnt; + max -= cnt; + } + return rv; +} + /* Move reqs from written to committing lists, returning count * of number moved. */ -int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, - int max) +int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max) { - int i, rv = 0, cnt; + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + struct pnfs_commit_array *array; + int rv = 0, cnt; - lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); - for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { - cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], - cinfo, max); - max -= cnt; + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); + cnt = pnfs_bucket_scan_array(cinfo, array->buckets, + array->nbuckets, max); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); rv += cnt; + max -= cnt; + if (!max) + break; } + rcu_read_unlock(); return rv; } EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists); -/* Pull everything off the committing lists and dump into @dst. */ -void pnfs_generic_recover_commit_reqs(struct list_head *dst, - struct nfs_commit_info *cinfo) +static unsigned int +pnfs_bucket_recover_commit_reqs(struct list_head *dst, + struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + struct nfs_commit_info *cinfo) { struct pnfs_commit_bucket *b; struct pnfs_layout_segment *freeme; - int nwritten; - int i; + unsigned int nwritten, ret = 0; + unsigned int i; - lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); restart: - for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { + for (i = 0, b = buckets; i < nbuckets; i++, b++) { nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0); if (!nwritten) continue; - cinfo->ds->nwritten -= nwritten; - if (list_empty(&b->written)) { - freeme = b->wlseg; - b->wlseg = NULL; + ret += nwritten; + freeme = pnfs_free_bucket_lseg(b); + if (freeme) { pnfs_put_lseg(freeme); goto restart; } } + return ret; +} + +/* Pull everything off the committing lists and dump into @dst. */ +void pnfs_generic_recover_commit_reqs(struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + struct pnfs_commit_array *array; + unsigned int nwritten; + + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); + nwritten = pnfs_bucket_recover_commit_reqs(dst, + array->buckets, + array->nbuckets, + cinfo); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); + fl_cinfo->nwritten -= nwritten; + } + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); -static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) +static struct nfs_page * +pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, struct page *page) +{ + struct nfs_page *req; + struct pnfs_commit_bucket *b; + unsigned int i; + + /* Linearly search the commit lists for each bucket until a matching + * request is found */ + for (i = 0, b = buckets; i < nbuckets; i++, b++) { + list_for_each_entry(req, &b->written, wb_list) { + if (req->wb_page == page) + return req->wb_head; + } + list_for_each_entry(req, &b->committing, wb_list) { + if (req->wb_page == page) + return req->wb_head; + } + } + return NULL; +} + +/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head reqest + * for @page + * @cinfo - commit info for current inode + * @page - page to search for matching head request + * + * Returns a the head request if one is found, otherwise returns NULL. + */ +struct nfs_page * +pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) { struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + struct pnfs_commit_array *array; + struct nfs_page *req; + + list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) { + req = pnfs_bucket_search_commit_reqs(array->buckets, + array->nbuckets, page); + if (req) + return req; + } + return NULL; +} +EXPORT_SYMBOL_GPL(pnfs_generic_search_commit_reqs); + +static struct pnfs_layout_segment * +pnfs_bucket_get_committing(struct list_head *head, + struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo) +{ + struct list_head *pos; + + list_for_each(pos, &bucket->committing) + cinfo->ds->ncommitting--; + list_splice_init(&bucket->committing, head); + return pnfs_free_bucket_lseg(bucket); +} + +static struct nfs_commit_data * +pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo) +{ + struct nfs_commit_data *data = nfs_commitdata_alloc(false); + + if (!data) + return NULL; + data->lseg = pnfs_bucket_get_committing(&data->pages, bucket, cinfo); + if (!data->lseg) + data->lseg = pnfs_get_lseg(bucket->lseg); + return data; +} + +static void pnfs_generic_retry_commit(struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + struct nfs_commit_info *cinfo, + unsigned int idx) +{ struct pnfs_commit_bucket *bucket; struct pnfs_layout_segment *freeme; - struct list_head *pos; LIST_HEAD(pages); - int i; - mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); - for (i = idx; i < fl_cinfo->nbuckets; i++) { - bucket = &fl_cinfo->buckets[i]; + for (bucket = buckets; idx < nbuckets; bucket++, idx++) { if (list_empty(&bucket->committing)) continue; - freeme = bucket->clseg; - bucket->clseg = NULL; - list_for_each(pos, &bucket->committing) - cinfo->ds->ncommitting--; - list_splice_init(&bucket->committing, &pages); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + freeme = pnfs_bucket_get_committing(&pages, bucket, cinfo); mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - nfs_retry_commit(&pages, freeme, cinfo, i); + nfs_retry_commit(&pages, freeme, cinfo, idx); pnfs_put_lseg(freeme); - mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); } - mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } static unsigned int -pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, - struct list_head *list) +pnfs_bucket_alloc_ds_commits(struct list_head *list, + struct pnfs_commit_bucket *buckets, + unsigned int nbuckets, + struct nfs_commit_info *cinfo) { - struct pnfs_ds_commit_info *fl_cinfo; struct pnfs_commit_bucket *bucket; struct nfs_commit_data *data; - int i; + unsigned int i; unsigned int nreq = 0; - fl_cinfo = cinfo->ds; - bucket = fl_cinfo->buckets; - for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { + for (i = 0, bucket = buckets; i < nbuckets; i++, bucket++) { if (list_empty(&bucket->committing)) continue; - data = nfs_commitdata_alloc(false); - if (!data) - break; - data->ds_commit_index = i; - list_add(&data->pages, list); - nreq++; + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + if (!list_empty(&bucket->committing)) { + data = pnfs_bucket_fetch_commitdata(bucket, cinfo); + if (!data) + goto out_error; + data->ds_commit_index = i; + list_add_tail(&data->list, list); + atomic_inc(&cinfo->mds->rpcs_out); + nreq++; + } + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } - + return nreq; +out_error: + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); /* Clean up on error */ - pnfs_generic_retry_commit(cinfo, i); + pnfs_generic_retry_commit(buckets, nbuckets, cinfo, i); return nreq; } -static inline -void pnfs_fetch_commit_bucket_list(struct list_head *pages, - struct nfs_commit_data *data, - struct nfs_commit_info *cinfo) +static unsigned int +pnfs_alloc_ds_commits_list(struct list_head *list, + struct pnfs_ds_commit_info *fl_cinfo, + struct nfs_commit_info *cinfo) { - struct pnfs_commit_bucket *bucket; - struct list_head *pos; - - bucket = &cinfo->ds->buckets[data->ds_commit_index]; - mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); - list_for_each(pos, &bucket->committing) - cinfo->ds->ncommitting--; - list_splice_init(&bucket->committing, pages); - data->lseg = bucket->clseg; - bucket->clseg = NULL; - mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - -} + struct pnfs_commit_array *array; + unsigned int ret = 0; -/* Helper function for pnfs_generic_commit_pagelist to catch an empty - * page list. This can happen when two commits race. - * - * This must be called instead of nfs_init_commit - call one or the other, but - * not both! - */ -static bool -pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages, - struct nfs_commit_data *data, - struct nfs_commit_info *cinfo) -{ - if (list_empty(pages)) { - if (atomic_dec_and_test(&cinfo->mds->rpcs_out)) - wake_up_var(&cinfo->mds->rpcs_out); - /* don't call nfs_commitdata_release - it tries to put - * the open_context which is not acquired until nfs_init_commit - * which has not been called on @data */ - WARN_ON_ONCE(data->context); - nfs_commit_free(data); - return true; + rcu_read_lock(); + list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) { + if (!array->lseg || !pnfs_get_commit_array(array)) + continue; + rcu_read_unlock(); + ret += pnfs_bucket_alloc_ds_commits(list, array->buckets, + array->nbuckets, cinfo); + rcu_read_lock(); + pnfs_put_commit_array(array, cinfo->inode); } - - return false; + rcu_read_unlock(); + return ret; } /* This follows nfs_commit_list pretty closely */ @@ -262,6 +512,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int (*initiate_commit)(struct nfs_commit_data *data, int how)) { + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; struct nfs_commit_data *data, *tmp; LIST_HEAD(list); unsigned int nreq = 0; @@ -269,40 +520,25 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, if (!list_empty(mds_pages)) { data = nfs_commitdata_alloc(true); data->ds_commit_index = -1; - list_add(&data->pages, &list); + list_splice_init(mds_pages, &data->pages); + list_add_tail(&data->list, &list); + atomic_inc(&cinfo->mds->rpcs_out); nreq++; } - nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); - + nreq += pnfs_alloc_ds_commits_list(&list, fl_cinfo, cinfo); if (nreq == 0) goto out; - atomic_add(nreq, &cinfo->mds->rpcs_out); - - list_for_each_entry_safe(data, tmp, &list, pages) { - list_del_init(&data->pages); + list_for_each_entry_safe(data, tmp, &list, list) { + list_del(&data->list); if (data->ds_commit_index < 0) { - /* another commit raced with us */ - if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages, - data, cinfo)) - continue; - - nfs_init_commit(data, mds_pages, NULL, cinfo); + nfs_init_commit(data, NULL, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), data->mds_ops, how, 0); } else { - LIST_HEAD(pages); - - pnfs_fetch_commit_bucket_list(&pages, data, cinfo); - - /* another commit raced with us */ - if (pnfs_generic_commit_cancel_empty_pagelist(&pages, - data, cinfo)) - continue; - - nfs_init_commit(data, &pages, data->lseg, cinfo); + nfs_init_commit(data, NULL, data->lseg, cinfo); initiate_commit(data, how); } } @@ -930,32 +1166,33 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx) { struct list_head *list; - struct pnfs_commit_bucket *buckets; + struct pnfs_commit_array *array; + struct pnfs_commit_bucket *bucket; mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); - buckets = cinfo->ds->buckets; - list = &buckets[ds_commit_idx].written; - if (list_empty(list)) { - if (!pnfs_is_valid_lseg(lseg)) { - mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - cinfo->completion_ops->resched_write(cinfo, req); - return; - } - /* Non-empty buckets hold a reference on the lseg. That ref - * is normally transferred to the COMMIT call and released - * there. It could also be released if the last req is pulled - * off due to a rewrite, in which case it will be done in - * pnfs_common_clear_request_commit - */ - WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL); - buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg); - } + array = pnfs_lookup_commit_array(cinfo->ds, lseg); + if (!array || !pnfs_is_valid_lseg(lseg)) + goto out_resched; + bucket = &array->buckets[ds_commit_idx]; + list = &bucket->written; + /* Non-empty buckets hold a reference on the lseg. That ref + * is normally transferred to the COMMIT call and released + * there. It could also be released if the last req is pulled + * off due to a rewrite, in which case it will be done in + * pnfs_common_clear_request_commit + */ + if (!bucket->lseg) + bucket->lseg = pnfs_get_lseg(lseg); set_bit(PG_COMMIT_TO_DS, &req->wb_flags); cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); nfs_mark_page_unstable(req->wb_page, cinfo); + return; +out_resched: + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); + cinfo->completion_ops->resched_write(cinfo, req); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 34bb9add2302..13b22e898116 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -250,7 +250,7 @@ static int nfs_readpage_done(struct rpc_task *task, trace_nfs_readpage_done(task, hdr); if (task->tk_status == -ESTALE) { - set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + nfs_set_inode_stale(inode); nfs_mark_for_revalidate(inode); } return 0; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index dada09b391c6..59ef3b13ccca 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -176,6 +176,41 @@ void nfs_sb_deactive(struct super_block *sb) } EXPORT_SYMBOL_GPL(nfs_sb_deactive); +static int __nfs_list_for_each_server(struct list_head *head, + int (*fn)(struct nfs_server *, void *), + void *data) +{ + struct nfs_server *server, *last = NULL; + int ret = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(server, head, client_link) { + if (!nfs_sb_active(server->super)) + continue; + rcu_read_unlock(); + if (last) + nfs_sb_deactive(last->super); + last = server; + ret = fn(server, data); + if (ret) + goto out; + rcu_read_lock(); + } + rcu_read_unlock(); +out: + if (last) + nfs_sb_deactive(last->super); + return ret; +} + +int nfs_client_for_each_server(struct nfs_client *clp, + int (*fn)(struct nfs_server *, void *), + void *data) +{ + return __nfs_list_for_each_server(&clp->cl_superblocks, fn, data); +} +EXPORT_SYMBOL_GPL(nfs_client_for_each_server); + /* * Deliver file system statistics to userspace */ @@ -1179,7 +1214,6 @@ int nfs_get_tree_common(struct fs_context *fc) struct super_block *s; int (*compare_super)(struct super_block *, struct fs_context *) = nfs_compare_super; struct nfs_server *server = ctx->server; - unsigned long kflags = 0, kflags_out = 0; int error; ctx->server = NULL; @@ -1239,26 +1273,6 @@ int nfs_get_tree_common(struct fs_context *fc) goto error_splat_super; } - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) - kflags |= SECURITY_LSM_NATIVE_LABELS; - if (ctx->clone_data.sb) { - if (d_inode(fc->root)->i_fop != &nfs_dir_operations) { - error = -ESTALE; - goto error_splat_root; - } - /* clone any lsm security options from the parent to the new sb */ - error = security_sb_clone_mnt_opts(ctx->clone_data.sb, s, kflags, - &kflags_out); - } else { - error = security_sb_set_mnt_opts(s, fc->security, - kflags, &kflags_out); - } - if (error) - goto error_splat_root; - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && - !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) - NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; - s->s_flags |= SB_ACTIVE; error = 0; @@ -1268,10 +1282,6 @@ out: out_err_nosb: nfs_free_server(server); goto out; - -error_splat_root: - dput(fc->root); - fc->root = NULL; error_splat_super: deactivate_locked_super(s); goto out; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 0effeee28352..b27ebdccef70 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -98,7 +98,7 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data) .callback_ops = &nfs_unlink_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; struct rpc_task *task; struct inode *dir = d_inode(data->dentry->d_parent); @@ -341,7 +341,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, .callback_ops = &nfs_rename_ops, .workqueue = nfsiod_workqueue, .rpc_client = NFS_CLIENT(old_dir), - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; data = kzalloc(sizeof(*data), GFP_KERNEL); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c478b772cc49..df4b87c30ac9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -149,6 +149,31 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc) kref_put(&ioc->refcount, nfs_io_completion_release); } +static void +nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) +{ + if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) { + kref_get(&req->wb_kref); + atomic_long_inc(&NFS_I(inode)->nrequests); + } +} + +static int +nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) +{ + int ret; + + if (!test_bit(PG_REMOVE, &req->wb_flags)) + return 0; + ret = nfs_page_group_lock(req); + if (ret) + return ret; + if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) + nfs_page_set_inode_ref(req, inode); + nfs_page_group_unlock(req); + return 0; +} + static struct nfs_page * nfs_page_private_request(struct page *page) { @@ -218,6 +243,36 @@ static struct nfs_page *nfs_page_find_head_request(struct page *page) return req; } +static struct nfs_page *nfs_find_and_lock_page_request(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_page *req, *head; + int ret; + + for (;;) { + req = nfs_page_find_head_request(page); + if (!req) + return req; + head = nfs_page_group_lock_head(req); + if (head != req) + nfs_release_request(req); + if (IS_ERR(head)) + return head; + ret = nfs_cancel_remove_inode(head, inode); + if (ret < 0) { + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); + } + /* Ensure that nobody removed the request before we locked it */ + if (head == nfs_page_private_request(page)) + break; + if (PageSwapCache(page)) + break; + nfs_unlock_and_release_request(head); + } + return head; +} + /* Adjust the file length if we're writing beyond the end */ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { @@ -380,34 +435,6 @@ static void nfs_end_page_writeback(struct nfs_page *req) } /* - * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req - * - * this is a helper function for nfs_lock_and_join_requests - * - * @inode - inode associated with request page group, must be holding inode lock - * @head - head request of page group, must be holding head lock - * @req - request that couldn't lock and needs to wait on the req bit lock - * - * NOTE: this must be called holding page_group bit lock - * which will be released before returning. - * - * returns 0 on success, < 0 on error. - */ -static void -nfs_unroll_locks(struct inode *inode, struct nfs_page *head, - struct nfs_page *req) -{ - struct nfs_page *tmp; - - /* relinquish all the locks successfully grabbed this run */ - for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { - if (!kref_read(&tmp->wb_kref)) - continue; - nfs_unlock_and_release_request(tmp); - } -} - -/* * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests * * @destroy_list - request list (using wb_this_page) terminated by @old_head @@ -428,22 +455,29 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, destroy_list = (subreq->wb_this_page == old_head) ? NULL : subreq->wb_this_page; + /* Note: lock subreq in order to change subreq->wb_head */ + nfs_page_set_headlock(subreq); WARN_ON_ONCE(old_head != subreq->wb_head); /* make sure old group is not used */ subreq->wb_this_page = subreq; + subreq->wb_head = subreq; clear_bit(PG_REMOVE, &subreq->wb_flags); /* Note: races with nfs_page_group_destroy() */ if (!kref_read(&subreq->wb_kref)) { /* Check if we raced with nfs_page_group_destroy() */ - if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) + if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) { + nfs_page_clear_headlock(subreq); nfs_free_request(subreq); + } else + nfs_page_clear_headlock(subreq); continue; } + nfs_page_clear_headlock(subreq); - subreq->wb_head = subreq; + nfs_release_request(old_head); if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { nfs_release_request(subreq); @@ -457,105 +491,43 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, } /* - * nfs_lock_and_join_requests - join all subreqs to the head req and return - * a locked reference, cancelling any pending - * operations for this page. - * - * @page - the page used to lookup the "page group" of nfs_page structures + * nfs_join_page_group - destroy subrequests of the head req + * @head: the page used to lookup the "page group" of nfs_page structures + * @inode: Inode to which the request belongs. * * This function joins all sub requests to the head request by first * locking all requests in the group, cancelling any pending operations * and finally updating the head request to cover the whole range covered by * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. - * - * Returns a locked, referenced pointer to the head request - which after - * this call is guaranteed to be the only request associated with the page. - * Returns NULL if no requests are found for @page, or a ERR_PTR if an - * error was encountered. */ -static struct nfs_page * -nfs_lock_and_join_requests(struct page *page) +void +nfs_join_page_group(struct nfs_page *head, struct inode *inode) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *head, *subreq; + struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; - unsigned int total_bytes; - int ret; + unsigned int pgbase, off, bytes; -try_again: - /* - * A reference is taken only on the head request which acts as a - * reference to the whole page group - the group will not be destroyed - * until the head reference is released. - */ - head = nfs_page_find_head_request(page); - if (!head) - return NULL; - - /* lock the page head first in order to avoid an ABBA inefficiency */ - if (!nfs_lock_request(head)) { - ret = nfs_wait_on_request(head); - nfs_release_request(head); - if (ret < 0) - return ERR_PTR(ret); - goto try_again; - } - - /* Ensure that nobody removed the request before we locked it */ - if (head != nfs_page_private_request(page) && !PageSwapCache(page)) { - nfs_unlock_and_release_request(head); - goto try_again; - } - - ret = nfs_page_group_lock(head); - if (ret < 0) - goto release_request; - - /* lock each request in the page group */ - total_bytes = head->wb_bytes; + pgbase = head->wb_pgbase; + bytes = head->wb_bytes; + off = head->wb_offset; for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { - - if (!kref_get_unless_zero(&subreq->wb_kref)) { - if (subreq->wb_offset == head->wb_offset + total_bytes) - total_bytes += subreq->wb_bytes; - continue; - } - - while (!nfs_lock_request(subreq)) { - /* - * Unlock page to allow nfs_page_group_sync_on_bit() - * to succeed - */ - nfs_page_group_unlock(head); - ret = nfs_wait_on_request(subreq); - if (!ret) - ret = nfs_page_group_lock(head); - if (ret < 0) { - nfs_unroll_locks(inode, head, subreq); - nfs_release_request(subreq); - goto release_request; - } - } - /* - * Subrequests are always contiguous, non overlapping - * and in order - but may be repeated (mirrored writes). - */ - if (subreq->wb_offset == (head->wb_offset + total_bytes)) { - /* keep track of how many bytes this group covers */ - total_bytes += subreq->wb_bytes; - } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset || - ((subreq->wb_offset + subreq->wb_bytes) > - (head->wb_offset + total_bytes)))) { - nfs_page_group_unlock(head); - nfs_unroll_locks(inode, head, subreq); - nfs_unlock_and_release_request(subreq); - ret = -EIO; - goto release_request; + /* Subrequests should always form a contiguous range */ + if (pgbase > subreq->wb_pgbase) { + off -= pgbase - subreq->wb_pgbase; + bytes += pgbase - subreq->wb_pgbase; + pgbase = subreq->wb_pgbase; } + bytes = max(subreq->wb_pgbase + subreq->wb_bytes + - pgbase, bytes); } + /* Set the head request's range to cover the former page group */ + head->wb_pgbase = pgbase; + head->wb_bytes = bytes; + head->wb_offset = off; + /* Now that all requests are locked, make sure they aren't on any list. * Commit list removal accounting is done after locks are dropped */ subreq = head; @@ -569,36 +541,52 @@ try_again: /* destroy list will be terminated by head */ destroy_list = head->wb_this_page; head->wb_this_page = head; - - /* change head request to cover whole range that - * the former page group covered */ - head->wb_bytes = total_bytes; } - /* Postpone destruction of this request */ - if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) { - set_bit(PG_INODE_REF, &head->wb_flags); - kref_get(&head->wb_kref); - atomic_long_inc(&NFS_I(inode)->nrequests); - } + nfs_destroy_unlinked_subrequests(destroy_list, head, inode); +} - nfs_page_group_unlock(head); +/* + * nfs_lock_and_join_requests - join all subreqs to the head req + * @page: the page used to lookup the "page group" of nfs_page structures + * + * This function joins all sub requests to the head request by first + * locking all requests in the group, cancelling any pending operations + * and finally updating the head request to cover the whole range covered by + * the (former) group. All subrequests are removed from any write or commit + * lists, unlinked from the group and destroyed. + * + * Returns a locked, referenced pointer to the head request - which after + * this call is guaranteed to be the only request associated with the page. + * Returns NULL if no requests are found for @page, or a ERR_PTR if an + * error was encountered. + */ +static struct nfs_page * +nfs_lock_and_join_requests(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_page *head; + int ret; - nfs_destroy_unlinked_subrequests(destroy_list, head, inode); + /* + * A reference is taken only on the head request which acts as a + * reference to the whole page group - the group will not be destroyed + * until the head reference is released. + */ + head = nfs_find_and_lock_page_request(page); + if (IS_ERR_OR_NULL(head)) + return head; - /* Did we lose a race with nfs_inode_remove_request()? */ - if (!(PagePrivate(page) || PageSwapCache(page))) { + /* lock each request in the page group */ + ret = nfs_page_group_lock_subrequests(head); + if (ret < 0) { nfs_unlock_and_release_request(head); - return NULL; + return ERR_PTR(ret); } - /* still holds ref on head from nfs_page_find_head_request - * and still has lock on head from lock loop */ - return head; + nfs_join_page_group(head, inode); -release_request: - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); + return head; } static void nfs_write_error(struct nfs_page *req, int error) @@ -1707,7 +1695,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | flags, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags, .priority = priority, }; /* Set up the initial task struct. */ @@ -1746,14 +1734,19 @@ void nfs_init_commit(struct nfs_commit_data *data, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo) { - struct nfs_page *first = nfs_list_entry(head->next); - struct nfs_open_context *ctx = nfs_req_openctx(first); - struct inode *inode = d_inode(ctx->dentry); + struct nfs_page *first; + struct nfs_open_context *ctx; + struct inode *inode; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ - list_splice_init(head, &data->pages); + if (head) + list_splice_init(head, &data->pages); + + first = nfs_list_entry(data->pages.next); + ctx = nfs_req_openctx(first); + inode = d_inode(ctx->dentry); data->inode = inode; data->cred = ctx->cred; @@ -1869,8 +1862,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* Okay, COMMIT succeeded, apparently. Check the verifier * returned by the server against all stored verfs. */ - if (verf->committed > NFS_UNSTABLE && - !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier)) { + if (nfs_write_match_verf(verf, req)) { /* We have a match */ if (req->wb_page) nfs_inode_remove_request(req); diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index f368f3215f88..99d2cae91bd6 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -136,7 +136,7 @@ config NFSD_FLEXFILELAYOUT config NFSD_V4_2_INTER_SSC bool "NFSv4.2 inter server to server COPY" - depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2 + depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2 && NFS_FS=y help This option enables support for NFSv4.2 inter server to server copy where the destination server calls the NFSv4.2 diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 15422c951fd1..cb777fe82988 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -23,6 +23,7 @@ #include "netns.h" #include "pnfs.h" #include "filecache.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -50,6 +51,11 @@ static void expkey_put(struct kref *ref) kfree_rcu(key, ek_rcu); } +static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); +} + static void expkey_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -140,7 +146,9 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) if (len == 0) { set_bit(CACHE_NEGATIVE, &key.h.flags); ek = svc_expkey_update(cd, &key, ek); - if (!ek) + if (ek) + trace_nfsd_expkey_update(ek, NULL); + else err = -ENOMEM; } else { err = kern_path(buf, 0, &key.ek_path); @@ -150,7 +158,9 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen) dprintk("Found the path %s\n", buf); ek = svc_expkey_update(cd, &key, ek); - if (!ek) + if (ek) + trace_nfsd_expkey_update(ek, buf); + else err = -ENOMEM; path_put(&key.ek_path); } @@ -249,6 +259,7 @@ static const struct cache_detail svc_expkey_cache_template = { .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", .cache_put = expkey_put, + .cache_upcall = expkey_upcall, .cache_request = expkey_request, .cache_parse = expkey_parse, .cache_show = expkey_show, @@ -330,6 +341,11 @@ static void svc_export_put(struct kref *ref) kfree_rcu(exp, ex_rcu); } +static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); +} + static void svc_export_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -643,15 +659,17 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) } expp = svc_export_lookup(&exp); - if (expp) - expp = svc_export_update(&exp, expp); - else - err = -ENOMEM; - cache_flush(); - if (expp == NULL) + if (!expp) { err = -ENOMEM; - else + goto out4; + } + expp = svc_export_update(&exp, expp); + if (expp) { + trace_nfsd_export_update(expp); + cache_flush(); exp_put(expp); + } else + err = -ENOMEM; out4: nfsd4_fslocs_free(&exp.ex_fslocs); kfree(exp.ex_uuid); @@ -767,6 +785,7 @@ static const struct cache_detail svc_export_cache_template = { .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", .cache_put = svc_export_put, + .cache_upcall = svc_export_upcall, .cache_request = svc_export_request, .cache_parse = svc_export_parse, .cache_show = svc_export_show, @@ -832,8 +851,10 @@ exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type, if (ek == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &ek->h, reqp); - if (err) + if (err) { + trace_nfsd_exp_find_key(&key, err); return ERR_PTR(err); + } return ek; } @@ -855,8 +876,10 @@ exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp, if (exp == NULL) return ERR_PTR(-ENOMEM); err = cache_check(cd, &exp->h, reqp); - if (err) + if (err) { + trace_nfsd_exp_get_by_name(&key, err); return ERR_PTR(err); + } return exp; } diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 22e77ede9f14..82198d747c4c 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -890,7 +890,7 @@ nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, unsigned char need = may_flags & NFSD_FILE_MAY_MASK; hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, - nf_node) { + nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) { if ((need & nf->nf_may) != need) continue; if (nf->nf_inode != inode) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 2baf32311e00..09aa545825bd 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -172,6 +172,8 @@ struct nfsd_net { unsigned int longest_chain_cachesize; struct shrinker nfsd_reply_cache_shrinker; + /* utsname taken from the the process that starts the server */ + char nfsd_name[UNX_MAXNODENAME+1]; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index d1f285245af8..9460be8a8321 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -122,6 +122,12 @@ idtoname_hash(struct ent *ent) return hash; } +static int +idtoname_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + static void idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) @@ -184,6 +190,7 @@ static const struct cache_detail idtoname_cache_template = { .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", .cache_put = ent_put, + .cache_upcall = idtoname_upcall, .cache_request = idtoname_request, .cache_parse = idtoname_parse, .cache_show = idtoname_show, @@ -295,6 +302,12 @@ nametoid_hash(struct ent *ent) return hash_str(ent->name, ENT_HASHBITS); } +static int +nametoid_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + static void nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, int *blen) @@ -347,6 +360,7 @@ static const struct cache_detail nametoid_cache_template = { .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", .cache_put = ent_put, + .cache_upcall = nametoid_upcall, .cache_request = nametoid_request, .cache_parse = nametoid_parse, .cache_show = nametoid_show, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 65cfe9ab47be..e32ecedece0f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -494,6 +494,8 @@ find_any_file(struct nfs4_file *f) { struct nfsd_file *ret; + if (!f) + return NULL; spin_lock(&f->fi_lock); ret = __nfs4_get_fd(f, O_RDWR); if (!ret) { @@ -1309,6 +1311,12 @@ static void nfs4_put_stateowner(struct nfs4_stateowner *sop) nfs4_free_stateowner(sop); } +static bool +nfs4_ol_stateid_unhashed(const struct nfs4_ol_stateid *stp) +{ + return list_empty(&stp->st_perfile); +} + static bool unhash_ol_stateid(struct nfs4_ol_stateid *stp) { struct nfs4_file *fp = stp->st_stid.sc_file; @@ -1379,9 +1387,11 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) { lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); + if (!unhash_ol_stateid(stp)) + return false; list_del_init(&stp->st_locks); nfs4_unhash_stid(&stp->st_stid); - return unhash_ol_stateid(stp); + return true; } static void release_lock_stateid(struct nfs4_ol_stateid *stp) @@ -1446,13 +1456,12 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, static bool unhash_open_stateid(struct nfs4_ol_stateid *stp, struct list_head *reaplist) { - bool unhashed; - lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); - unhashed = unhash_ol_stateid(stp); + if (!unhash_ol_stateid(stp)) + return false; release_open_stateid_locks(stp, reaplist); - return unhashed; + return true; } static void release_open_stateid(struct nfs4_ol_stateid *stp) @@ -2636,7 +2645,7 @@ static const struct file_operations client_ctl_fops = { static const struct tree_descr client_files[] = { [0] = {"info", &client_info_fops, S_IRUSR}, [1] = {"states", &client_states_fops, S_IRUSR}, - [2] = {"ctl", &client_ctl_fops, S_IRUSR|S_IWUSR}, + [2] = {"ctl", &client_ctl_fops, S_IWUSR}, [3] = {""}, }; @@ -4343,7 +4352,8 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval) { struct nfs4_file *fp; - hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { + hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash, + lockdep_is_held(&state_lock)) { if (fh_match(&fp->fi_fhandle, fh)) { if (refcount_inc_not_zero(&fp->fi_ref)) return fp; @@ -5521,15 +5531,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) return status; - /* Client debugging aid. */ - if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { - char addr_str[INET6_ADDRSTRLEN]; - rpc_ntop((struct sockaddr *)&cl->cl_addr, addr_str, - sizeof(addr_str)); - pr_warn_ratelimited("NFSD: client %s testing state ID " - "with incorrect client ID\n", addr_str); + if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) return status; - } spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); if (!s) @@ -6393,21 +6396,21 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, } static struct nfs4_ol_stateid * -find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) +find_lock_stateid(const struct nfs4_lockowner *lo, + const struct nfs4_ol_stateid *ost) { struct nfs4_ol_stateid *lst; - struct nfs4_client *clp = lo->lo_owner.so_client; - lockdep_assert_held(&clp->cl_lock); + lockdep_assert_held(&ost->st_stid.sc_client->cl_lock); - list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { - if (lst->st_stid.sc_type != NFS4_LOCK_STID) - continue; - if (lst->st_stid.sc_file == fp) { - refcount_inc(&lst->st_stid.sc_count); - return lst; + /* If ost is not hashed, ost->st_locks will not be valid */ + if (!nfs4_ol_stateid_unhashed(ost)) + list_for_each_entry(lst, &ost->st_locks, st_locks) { + if (lst->st_stateowner == &lo->lo_owner) { + refcount_inc(&lst->st_stid.sc_count); + return lst; + } } - } return NULL; } @@ -6423,11 +6426,11 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); retry: spin_lock(&clp->cl_lock); - spin_lock(&fp->fi_lock); - retstp = find_lock_stateid(lo, fp); + if (nfs4_ol_stateid_unhashed(open_stp)) + goto out_close; + retstp = find_lock_stateid(lo, open_stp); if (retstp) - goto out_unlock; - + goto out_found; refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_LOCK_STID; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); @@ -6436,22 +6439,26 @@ retry: stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; + spin_lock(&fp->fi_lock); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); -out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&clp->cl_lock); - if (retstp) { - if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { - nfs4_put_stid(&retstp->st_stid); - goto retry; - } - /* To keep mutex tracking happy */ - mutex_unlock(&stp->st_mutex); - stp = retstp; - } return stp; +out_found: + spin_unlock(&clp->cl_lock); + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; + } + /* To keep mutex tracking happy */ + mutex_unlock(&stp->st_mutex); + return retstp; +out_close: + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + return NULL; } static struct nfs4_ol_stateid * @@ -6466,7 +6473,7 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, *new = false; spin_lock(&clp->cl_lock); - lst = find_lock_stateid(lo, fi); + lst = find_lock_stateid(lo, ost); spin_unlock(&clp->cl_lock); if (lst != NULL) { if (nfsd4_lock_ol_stateid(lst) == nfs_ok) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 9761512674a0..996ac01ee977 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3591,23 +3591,22 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, __be32 nfserr; __be32 tmp; __be32 *p; - u32 zzz = 0; int pad; + /* + * svcrdma requires every READ payload to start somewhere + * in xdr->pages. + */ + if (xdr->iov == xdr->buf->head) { + xdr->iov = NULL; + xdr->end = xdr->p; + } + len = maxcount; v = 0; - - thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p)); - p = xdr_reserve_space(xdr, (thislen+3)&~3); - WARN_ON_ONCE(!p); - resp->rqstp->rq_vec[v].iov_base = p; - resp->rqstp->rq_vec[v].iov_len = thislen; - v++; - len -= thislen; - while (len) { thislen = min_t(long, len, PAGE_SIZE); - p = xdr_reserve_space(xdr, (thislen+3)&~3); + p = xdr_reserve_space(xdr, thislen); WARN_ON_ONCE(!p); resp->rqstp->rq_vec[v].iov_base = p; resp->rqstp->rq_vec[v].iov_len = thislen; @@ -3616,23 +3615,25 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, } read->rd_vlen = v; - len = maxcount; nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount, &eof); read->rd_length = maxcount; if (nfserr) return nfserr; - xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); + if (svc_encode_read_payload(resp->rqstp, starting_len + 8, maxcount)) + return nfserr_io; + xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount)); tmp = htonl(eof); write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); tmp = htonl(maxcount); write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4); + tmp = xdr_zero; pad = (maxcount&3) ? 4 - (maxcount&3) : 0; write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount, - &zzz, pad); + &tmp, pad); return 0; } @@ -4005,11 +4006,12 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, int major_id_sz; int server_scope_sz; uint64_t minor_id = 0; + struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id); - major_id = utsname()->nodename; - major_id_sz = strlen(major_id); - server_scope = utsname()->nodename; - server_scope_sz = strlen(server_scope); + major_id = nn->nfsd_name; + major_id_sz = strlen(nn->nfsd_name); + server_scope = nn->nfsd_name; + server_scope_sz = strlen(nn->nfsd_name); p = xdr_reserve_space(xdr, 8 /* eir_clientid */ + diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index e109a1007704..3bb2db947d29 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1333,6 +1333,7 @@ void nfsd_client_rmdir(struct dentry *dentry) dget(dentry); ret = simple_rmdir(dir, dentry); WARN_ON_ONCE(ret); + fsnotify_rmdir(dir, dentry); d_delete(dentry); inode_unlock(dir); } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index b319080288c3..37bc8f5f4514 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -14,6 +14,7 @@ #include "nfsd.h" #include "vfs.h" #include "auth.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_FH @@ -209,11 +210,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) } error = nfserr_stale; - if (PTR_ERR(exp) == -ENOENT) - return error; + if (IS_ERR(exp)) { + trace_nfsd_set_fh_dentry_badexport(rqstp, fhp, PTR_ERR(exp)); + + if (PTR_ERR(exp) == -ENOENT) + return error; - if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); + } if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) { /* Elevate privileges so that the lack of 'r' or 'x' @@ -267,6 +271,9 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) dentry = exportfs_decode_fh(exp->ex_path.mnt, fid, data_left, fileid_type, nfsd_acceptable, exp); + if (IS_ERR_OR_NULL(dentry)) + trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp, + dentry ? PTR_ERR(dentry) : -ESTALE); } if (dentry == NULL) goto out; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 3b77b904212d..ca9fd348548b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -749,6 +749,9 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) if (nrservs == 0 && nn->nfsd_serv == NULL) goto out; + strlcpy(nn->nfsd_name, utsname()->nodename, + sizeof(nn->nfsd_name)); + error = nfsd_create_serv(net); if (error) goto out; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 06dd0d337049..78c574251c60 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,6 +9,7 @@ #define _NFSD_TRACE_H #include <linux/tracepoint.h> +#include "export.h" #include "nfsfh.h" TRACE_EVENT(nfsd_compound, @@ -50,6 +51,127 @@ TRACE_EVENT(nfsd_compound_status, __get_str(name), __entry->status) ) +DECLARE_EVENT_CLASS(nfsd_fh_err_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + int status), + TP_ARGS(rqstp, fhp, status), + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, fh_hash) + __field(int, status) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->status = status; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x status=%d", + __entry->xid, __entry->fh_hash, + __entry->status) +) + +#define DEFINE_NFSD_FH_ERR_EVENT(name) \ +DEFINE_EVENT(nfsd_fh_err_class, nfsd_##name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *fhp, \ + int status), \ + TP_ARGS(rqstp, fhp, status)) + +DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badexport); +DEFINE_NFSD_FH_ERR_EVENT(set_fh_dentry_badhandle); + +TRACE_EVENT(nfsd_exp_find_key, + TP_PROTO(const struct svc_expkey *key, + int status), + TP_ARGS(key, status), + TP_STRUCT__entry( + __field(int, fsidtype) + __array(u32, fsid, 6) + __string(auth_domain, key->ek_client->name) + __field(int, status) + ), + TP_fast_assign( + __entry->fsidtype = key->ek_fsidtype; + memcpy(__entry->fsid, key->ek_fsid, 4*6); + __assign_str(auth_domain, key->ek_client->name); + __entry->status = status; + ), + TP_printk("fsid=%x::%s domain=%s status=%d", + __entry->fsidtype, + __print_array(__entry->fsid, 6, 4), + __get_str(auth_domain), + __entry->status + ) +); + +TRACE_EVENT(nfsd_expkey_update, + TP_PROTO(const struct svc_expkey *key, const char *exp_path), + TP_ARGS(key, exp_path), + TP_STRUCT__entry( + __field(int, fsidtype) + __array(u32, fsid, 6) + __string(auth_domain, key->ek_client->name) + __string(path, exp_path) + __field(bool, cache) + ), + TP_fast_assign( + __entry->fsidtype = key->ek_fsidtype; + memcpy(__entry->fsid, key->ek_fsid, 4*6); + __assign_str(auth_domain, key->ek_client->name); + __assign_str(path, exp_path); + __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); + ), + TP_printk("fsid=%x::%s domain=%s path=%s cache=%s", + __entry->fsidtype, + __print_array(__entry->fsid, 6, 4), + __get_str(auth_domain), + __get_str(path), + __entry->cache ? "pos" : "neg" + ) +); + +TRACE_EVENT(nfsd_exp_get_by_name, + TP_PROTO(const struct svc_export *key, + int status), + TP_ARGS(key, status), + TP_STRUCT__entry( + __string(path, key->ex_path.dentry->d_name.name) + __string(auth_domain, key->ex_client->name) + __field(int, status) + ), + TP_fast_assign( + __assign_str(path, key->ex_path.dentry->d_name.name); + __assign_str(auth_domain, key->ex_client->name); + __entry->status = status; + ), + TP_printk("path=%s domain=%s status=%d", + __get_str(path), + __get_str(auth_domain), + __entry->status + ) +); + +TRACE_EVENT(nfsd_export_update, + TP_PROTO(const struct svc_export *key), + TP_ARGS(key), + TP_STRUCT__entry( + __string(path, key->ex_path.dentry->d_name.name) + __string(auth_domain, key->ex_client->name) + __field(bool, cache) + ), + TP_fast_assign( + __assign_str(path, key->ex_path.dentry->d_name.name); + __assign_str(auth_domain, key->ex_client->name); + __entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags); + ), + TP_printk("path=%s domain=%s cache=%s", + __get_str(path), + __get_str(auth_domain), + __entry->cache ? "pos" : "neg" + ) +); + DECLARE_EVENT_CLASS(nfsd_io_class, TP_PROTO(struct svc_rqst *rqstp, struct svc_fh *fhp, diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 5778d1347b35..5435a40f82be 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -17,6 +17,59 @@ #include "fanotify.h" +static bool fanotify_path_equal(struct path *p1, struct path *p2) +{ + return p1->mnt == p2->mnt && p1->dentry == p2->dentry; +} + +static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, + __kernel_fsid_t *fsid2) +{ + return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; +} + +static bool fanotify_fh_equal(struct fanotify_fh *fh1, + struct fanotify_fh *fh2) +{ + if (fh1->type != fh2->type || fh1->len != fh2->len) + return false; + + /* Do not merge events if we failed to encode fh */ + if (fh1->type == FILEID_INVALID) + return false; + + return !fh1->len || + !memcmp(fanotify_fh_buf(fh1), fanotify_fh_buf(fh2), fh1->len); +} + +static bool fanotify_fid_event_equal(struct fanotify_fid_event *ffe1, + struct fanotify_fid_event *ffe2) +{ + /* Do not merge fid events without object fh */ + if (!ffe1->object_fh.len) + return false; + + return fanotify_fsid_equal(&ffe1->fsid, &ffe2->fsid) && + fanotify_fh_equal(&ffe1->object_fh, &ffe2->object_fh); +} + +static bool fanotify_name_event_equal(struct fanotify_name_event *fne1, + struct fanotify_name_event *fne2) +{ + /* + * Do not merge name events without dir fh. + * FAN_DIR_MODIFY does not encode object fh, so it may be empty. + */ + if (!fne1->dir_fh.len) + return false; + + if (fne1->name_len != fne2->name_len || + !fanotify_fh_equal(&fne1->dir_fh, &fne2->dir_fh)) + return false; + + return !memcmp(fne1->name, fne2->name, fne1->name_len); +} + static bool should_merge(struct fsnotify_event *old_fsn, struct fsnotify_event *new_fsn) { @@ -26,14 +79,15 @@ static bool should_merge(struct fsnotify_event *old_fsn, old = FANOTIFY_E(old_fsn); new = FANOTIFY_E(new_fsn); - if (old_fsn->inode != new_fsn->inode || old->pid != new->pid || - old->fh_type != new->fh_type || old->fh_len != new->fh_len) + if (old_fsn->objectid != new_fsn->objectid || + old->type != new->type || old->pid != new->pid) return false; - if (fanotify_event_has_path(old)) { - return old->path.mnt == new->path.mnt && - old->path.dentry == new->path.dentry; - } else if (fanotify_event_has_fid(old)) { + switch (old->type) { + case FANOTIFY_EVENT_TYPE_PATH: + return fanotify_path_equal(fanotify_event_path(old), + fanotify_event_path(new)); + case FANOTIFY_EVENT_TYPE_FID: /* * We want to merge many dirent events in the same dir (i.e. * creates/unlinks/renames), but we do not want to merge dirent @@ -42,11 +96,18 @@ static bool should_merge(struct fsnotify_event *old_fsn, * mask FAN_CREATE|FAN_DELETE|FAN_ONDIR if it describes mkdir+ * unlink pair or rmdir+create pair of events. */ - return (old->mask & FS_ISDIR) == (new->mask & FS_ISDIR) && - fanotify_fid_equal(&old->fid, &new->fid, old->fh_len); + if ((old->mask & FS_ISDIR) != (new->mask & FS_ISDIR)) + return false; + + return fanotify_fid_event_equal(FANOTIFY_FE(old), + FANOTIFY_FE(new)); + case FANOTIFY_EVENT_TYPE_FID_NAME: + return fanotify_name_event_equal(FANOTIFY_NE(old), + FANOTIFY_NE(new)); + default: + WARN_ON_ONCE(1); } - /* Do not merge events if we failed to encode fid */ return false; } @@ -151,7 +212,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, { __u32 marks_mask = 0, marks_ignored_mask = 0; __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS; - const struct path *path = data; + const struct path *path = fsnotify_data_path(data, data_type); struct fsnotify_mark *mark; int type; @@ -160,7 +221,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, if (!FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { /* Do we have path to open a file descriptor? */ - if (data_type != FSNOTIFY_EVENT_PATH) + if (!path) return 0; /* Path type events are only relevant for files and dirs */ if (!d_is_reg(path->dentry) && !d_can_lookup(path->dentry)) @@ -172,6 +233,13 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, continue; mark = iter_info->marks[type]; /* + * If the event is on dir and this mark doesn't care about + * events on dir, don't send it! + */ + if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR)) + continue; + + /* * If the event is for a child and this mark doesn't care about * events on a child, don't send it! */ @@ -187,9 +255,9 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, test_mask = event_mask & marks_mask & ~marks_ignored_mask; /* - * dirent modification events (create/delete/move) do not carry the - * child entry name/inode information. Instead, we report FAN_ONDIR - * for mkdir/rmdir so user can differentiate them from creat/unlink. + * For dirent modification events (create/delete/move) that do not carry + * the child entry name information, we report FAN_ONDIR for mkdir/rmdir + * so user can differentiate them from creat/unlink. * * For backward compatibility and consistency, do not report FAN_ONDIR * to user in legacy fanotify mode (reporting fd) and report FAN_ONDIR @@ -203,22 +271,20 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, user_mask &= ~FAN_ONDIR; } - if (event_mask & FS_ISDIR && - !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) - return 0; - return test_mask & user_mask; } -static int fanotify_encode_fid(struct fanotify_event *event, - struct inode *inode, gfp_t gfp, - __kernel_fsid_t *fsid) +static void fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, + gfp_t gfp) { - struct fanotify_fid *fid = &event->fid; - int dwords, bytes = 0; - int err, type; + int dwords, type, bytes = 0; + char *ext_buf = NULL; + void *buf = fh->buf; + int err; + + if (!inode) + goto out; - fid->ext_fh = NULL; dwords = 0; err = -ENOENT; type = exportfs_encode_inode_fh(inode, NULL, &dwords, NULL); @@ -229,31 +295,33 @@ static int fanotify_encode_fid(struct fanotify_event *event, if (bytes > FANOTIFY_INLINE_FH_LEN) { /* Treat failure to allocate fh as failure to allocate event */ err = -ENOMEM; - fid->ext_fh = kmalloc(bytes, gfp); - if (!fid->ext_fh) + ext_buf = kmalloc(bytes, gfp); + if (!ext_buf) goto out_err; + + *fanotify_fh_ext_buf_ptr(fh) = ext_buf; + buf = ext_buf; } - type = exportfs_encode_inode_fh(inode, fanotify_fid_fh(fid, bytes), - &dwords, NULL); + type = exportfs_encode_inode_fh(inode, buf, &dwords, NULL); err = -EINVAL; if (!type || type == FILEID_INVALID || bytes != dwords << 2) goto out_err; - fid->fsid = *fsid; - event->fh_len = bytes; + fh->type = type; + fh->len = bytes; - return type; + return; out_err: - pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, " - "type=%d, bytes=%d, err=%i)\n", - fsid->val[0], fsid->val[1], type, bytes, err); - kfree(fid->ext_fh); - fid->ext_fh = NULL; - event->fh_len = 0; - - return FILEID_INVALID; + pr_warn_ratelimited("fanotify: failed to encode fid (type=%d, len=%d, err=%i)\n", + type, bytes, err); + kfree(ext_buf); + *fanotify_fh_ext_buf_ptr(fh) = NULL; +out: + /* Report the event without a file identifier on encode error */ + fh->type = FILEID_INVALID; + fh->len = 0; } /* @@ -269,21 +337,22 @@ static struct inode *fanotify_fid_inode(struct inode *to_tell, u32 event_mask, { if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) return to_tell; - else if (data_type == FSNOTIFY_EVENT_INODE) - return (struct inode *)data; - else if (data_type == FSNOTIFY_EVENT_PATH) - return d_inode(((struct path *)data)->dentry); - return NULL; + + return (struct inode *)fsnotify_data_inode(data, data_type); } struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct inode *inode, u32 mask, const void *data, int data_type, + const struct qstr *file_name, __kernel_fsid_t *fsid) { struct fanotify_event *event = NULL; + struct fanotify_fid_event *ffe = NULL; + struct fanotify_name_event *fne = NULL; gfp_t gfp = GFP_KERNEL_ACCOUNT; struct inode *id = fanotify_fid_inode(inode, mask, data, data_type); + const struct path *path = fsnotify_data_path(data, data_type); /* * For queues with unlimited length lost events are not expected and @@ -305,33 +374,81 @@ struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, pevent = kmem_cache_alloc(fanotify_perm_event_cachep, gfp); if (!pevent) goto out; + event = &pevent->fae; + event->type = FANOTIFY_EVENT_TYPE_PATH_PERM; pevent->response = 0; pevent->state = FAN_EVENT_INIT; goto init; } - event = kmem_cache_alloc(fanotify_event_cachep, gfp); - if (!event) - goto out; -init: __maybe_unused - fsnotify_init_event(&event->fse, inode); + + /* + * For FAN_DIR_MODIFY event, we report the fid of the directory and + * the name of the modified entry. + * Allocate an fanotify_name_event struct and copy the name. + */ + if (mask & FAN_DIR_MODIFY && !(WARN_ON_ONCE(!file_name))) { + fne = kmalloc(sizeof(*fne) + file_name->len + 1, gfp); + if (!fne) + goto out; + + event = &fne->fae; + event->type = FANOTIFY_EVENT_TYPE_FID_NAME; + fne->name_len = file_name->len; + strcpy(fne->name, file_name->name); + goto init; + } + + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + ffe = kmem_cache_alloc(fanotify_fid_event_cachep, gfp); + if (!ffe) + goto out; + + event = &ffe->fae; + event->type = FANOTIFY_EVENT_TYPE_FID; + } else { + struct fanotify_path_event *pevent; + + pevent = kmem_cache_alloc(fanotify_path_event_cachep, gfp); + if (!pevent) + goto out; + + event = &pevent->fae; + event->type = FANOTIFY_EVENT_TYPE_PATH; + } + +init: + /* + * Use the victim inode instead of the watching inode as the id for + * event queue, so event reported on parent is merged with event + * reported on child when both directory and child watches exist. + */ + fsnotify_init_event(&event->fse, (unsigned long)id); event->mask = mask; if (FAN_GROUP_FLAG(group, FAN_REPORT_TID)) event->pid = get_pid(task_pid(current)); else event->pid = get_pid(task_tgid(current)); - event->fh_len = 0; - if (id && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { - /* Report the event without a file identifier on encode error */ - event->fh_type = fanotify_encode_fid(event, id, gfp, fsid); - } else if (data_type == FSNOTIFY_EVENT_PATH) { - event->fh_type = FILEID_ROOT; - event->path = *((struct path *)data); - path_get(&event->path); - } else { - event->fh_type = FILEID_INVALID; - event->path.mnt = NULL; - event->path.dentry = NULL; + + if (fsid && fanotify_event_fsid(event)) + *fanotify_event_fsid(event) = *fsid; + + if (fanotify_event_object_fh(event)) + fanotify_encode_fh(fanotify_event_object_fh(event), id, gfp); + + if (fanotify_event_dir_fh(event)) + fanotify_encode_fh(fanotify_event_dir_fh(event), id, gfp); + + if (fanotify_event_has_path(event)) { + struct path *p = fanotify_event_path(event); + + if (path) { + *p = *path; + path_get(path); + } else { + p->mnt = NULL; + p->dentry = NULL; + } } out: memalloc_unuse_memcg(); @@ -392,6 +509,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM); BUILD_BUG_ON(FAN_CREATE != FS_CREATE); BUILD_BUG_ON(FAN_DELETE != FS_DELETE); + BUILD_BUG_ON(FAN_DIR_MODIFY != FS_DIR_MODIFY); BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF); BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF); BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); @@ -402,7 +520,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20); mask = fanotify_group_event_mask(group, iter_info, mask, data, data_type); @@ -429,7 +547,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, } event = fanotify_alloc_event(group, inode, mask, data, data_type, - &fsid); + file_name, &fsid); ret = -ENOMEM; if (unlikely(!event)) { /* @@ -451,7 +569,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, ret = 0; } else if (fanotify_is_perm_event(mask)) { - ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event), + ret = fanotify_get_response(group, FANOTIFY_PERM(event), iter_info); } finish: @@ -470,22 +588,58 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) free_uid(user); } +static void fanotify_free_path_event(struct fanotify_event *event) +{ + path_put(fanotify_event_path(event)); + kmem_cache_free(fanotify_path_event_cachep, FANOTIFY_PE(event)); +} + +static void fanotify_free_perm_event(struct fanotify_event *event) +{ + path_put(fanotify_event_path(event)); + kmem_cache_free(fanotify_perm_event_cachep, FANOTIFY_PERM(event)); +} + +static void fanotify_free_fid_event(struct fanotify_event *event) +{ + struct fanotify_fid_event *ffe = FANOTIFY_FE(event); + + if (fanotify_fh_has_ext_buf(&ffe->object_fh)) + kfree(fanotify_fh_ext_buf(&ffe->object_fh)); + kmem_cache_free(fanotify_fid_event_cachep, ffe); +} + +static void fanotify_free_name_event(struct fanotify_event *event) +{ + struct fanotify_name_event *fne = FANOTIFY_NE(event); + + if (fanotify_fh_has_ext_buf(&fne->dir_fh)) + kfree(fanotify_fh_ext_buf(&fne->dir_fh)); + kfree(fne); +} + static void fanotify_free_event(struct fsnotify_event *fsn_event) { struct fanotify_event *event; event = FANOTIFY_E(fsn_event); - if (fanotify_event_has_path(event)) - path_put(&event->path); - else if (fanotify_event_has_ext_fh(event)) - kfree(event->fid.ext_fh); put_pid(event->pid); - if (fanotify_is_perm_event(event->mask)) { - kmem_cache_free(fanotify_perm_event_cachep, - FANOTIFY_PE(fsn_event)); - return; + switch (event->type) { + case FANOTIFY_EVENT_TYPE_PATH: + fanotify_free_path_event(event); + break; + case FANOTIFY_EVENT_TYPE_PATH_PERM: + fanotify_free_perm_event(event); + break; + case FANOTIFY_EVENT_TYPE_FID: + fanotify_free_fid_event(event); + break; + case FANOTIFY_EVENT_TYPE_FID_NAME: + fanotify_free_name_event(event); + break; + default: + WARN_ON_ONCE(1); } - kmem_cache_free(fanotify_event_cachep, event); } static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 68b30504284c..35bfbf4a7aac 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -5,7 +5,8 @@ #include <linux/exportfs.h> extern struct kmem_cache *fanotify_mark_cache; -extern struct kmem_cache *fanotify_event_cachep; +extern struct kmem_cache *fanotify_fid_event_cachep; +extern struct kmem_cache *fanotify_path_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; /* Possible states of the permission event */ @@ -18,94 +19,140 @@ enum { /* * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation). - * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and - * fh_* fields increase the size of fanotify_event by another 4 bytes. - * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and - * fh_* fields are packed in a hole after mask. + * fh buf should be dword aligned. On 64bit arch, the ext_buf pointer is + * stored in either the first or last 2 dwords. */ -#if BITS_PER_LONG == 32 #define FANOTIFY_INLINE_FH_LEN (3 << 2) -#else -#define FANOTIFY_INLINE_FH_LEN (4 << 2) -#endif -struct fanotify_fid { - __kernel_fsid_t fsid; - union { - unsigned char fh[FANOTIFY_INLINE_FH_LEN]; - unsigned char *ext_fh; - }; -}; +struct fanotify_fh { + unsigned char buf[FANOTIFY_INLINE_FH_LEN]; + u8 type; + u8 len; +} __aligned(4); + +static inline bool fanotify_fh_has_ext_buf(struct fanotify_fh *fh) +{ + return fh->len > FANOTIFY_INLINE_FH_LEN; +} + +static inline char **fanotify_fh_ext_buf_ptr(struct fanotify_fh *fh) +{ + BUILD_BUG_ON(__alignof__(char *) - 4 + sizeof(char *) > + FANOTIFY_INLINE_FH_LEN); + return (char **)ALIGN((unsigned long)(fh->buf), __alignof__(char *)); +} -static inline void *fanotify_fid_fh(struct fanotify_fid *fid, - unsigned int fh_len) +static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh) { - return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh; + return *fanotify_fh_ext_buf_ptr(fh); } -static inline bool fanotify_fid_equal(struct fanotify_fid *fid1, - struct fanotify_fid *fid2, - unsigned int fh_len) +static inline void *fanotify_fh_buf(struct fanotify_fh *fh) { - return fid1->fsid.val[0] == fid2->fsid.val[0] && - fid1->fsid.val[1] == fid2->fsid.val[1] && - !memcmp(fanotify_fid_fh(fid1, fh_len), - fanotify_fid_fh(fid2, fh_len), fh_len); + return fanotify_fh_has_ext_buf(fh) ? fanotify_fh_ext_buf(fh) : fh->buf; } /* - * Structure for normal fanotify events. It gets allocated in + * Common structure for fanotify events. Concrete structs are allocated in * fanotify_handle_event() and freed when the information is retrieved by - * userspace + * userspace. The type of event determines how it was allocated, how it will + * be freed and which concrete struct it may be cast to. */ +enum fanotify_event_type { + FANOTIFY_EVENT_TYPE_FID, /* fixed length */ + FANOTIFY_EVENT_TYPE_FID_NAME, /* variable length */ + FANOTIFY_EVENT_TYPE_PATH, + FANOTIFY_EVENT_TYPE_PATH_PERM, +}; + struct fanotify_event { struct fsnotify_event fse; u32 mask; - /* - * Those fields are outside fanotify_fid to pack fanotify_event nicely - * on 64bit arch and to use fh_type as an indication of whether path - * or fid are used in the union: - * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither. - */ - u8 fh_type; - u8 fh_len; - u16 pad; - union { - /* - * We hold ref to this path so it may be dereferenced at any - * point during this object's lifetime - */ - struct path path; - /* - * With FAN_REPORT_FID, we do not hold any reference on the - * victim object. Instead we store its NFS file handle and its - * filesystem's fsid as a unique identifier. - */ - struct fanotify_fid fid; - }; + enum fanotify_event_type type; struct pid *pid; }; -static inline bool fanotify_event_has_path(struct fanotify_event *event) +struct fanotify_fid_event { + struct fanotify_event fae; + __kernel_fsid_t fsid; + struct fanotify_fh object_fh; +}; + +static inline struct fanotify_fid_event * +FANOTIFY_FE(struct fanotify_event *event) { - return event->fh_type == FILEID_ROOT; + return container_of(event, struct fanotify_fid_event, fae); } -static inline bool fanotify_event_has_fid(struct fanotify_event *event) +struct fanotify_name_event { + struct fanotify_event fae; + __kernel_fsid_t fsid; + struct fanotify_fh dir_fh; + u8 name_len; + char name[0]; +}; + +static inline struct fanotify_name_event * +FANOTIFY_NE(struct fanotify_event *event) { - return event->fh_type != FILEID_ROOT && - event->fh_type != FILEID_INVALID; + return container_of(event, struct fanotify_name_event, fae); } -static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event) +static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event) { - return fanotify_event_has_fid(event) && - event->fh_len > FANOTIFY_INLINE_FH_LEN; + if (event->type == FANOTIFY_EVENT_TYPE_FID) + return &FANOTIFY_FE(event)->fsid; + else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) + return &FANOTIFY_NE(event)->fsid; + else + return NULL; } -static inline void *fanotify_event_fh(struct fanotify_event *event) +static inline struct fanotify_fh *fanotify_event_object_fh( + struct fanotify_event *event) { - return fanotify_fid_fh(&event->fid, event->fh_len); + if (event->type == FANOTIFY_EVENT_TYPE_FID) + return &FANOTIFY_FE(event)->object_fh; + else + return NULL; +} + +static inline struct fanotify_fh *fanotify_event_dir_fh( + struct fanotify_event *event) +{ + if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) + return &FANOTIFY_NE(event)->dir_fh; + else + return NULL; +} + +static inline int fanotify_event_object_fh_len(struct fanotify_event *event) +{ + struct fanotify_fh *fh = fanotify_event_object_fh(event); + + return fh ? fh->len : 0; +} + +static inline bool fanotify_event_has_name(struct fanotify_event *event) +{ + return event->type == FANOTIFY_EVENT_TYPE_FID_NAME; +} + +static inline int fanotify_event_name_len(struct fanotify_event *event) +{ + return fanotify_event_has_name(event) ? + FANOTIFY_NE(event)->name_len : 0; +} + +struct fanotify_path_event { + struct fanotify_event fae; + struct path path; +}; + +static inline struct fanotify_path_event * +FANOTIFY_PE(struct fanotify_event *event) +{ + return container_of(event, struct fanotify_path_event, fae); } /* @@ -117,15 +164,16 @@ static inline void *fanotify_event_fh(struct fanotify_event *event) */ struct fanotify_perm_event { struct fanotify_event fae; + struct path path; unsigned short response; /* userspace answer to the event */ unsigned short state; /* state of the event */ int fd; /* fd we passed to userspace for this event */ }; static inline struct fanotify_perm_event * -FANOTIFY_PE(struct fsnotify_event *fse) +FANOTIFY_PERM(struct fanotify_event *event) { - return container_of(fse, struct fanotify_perm_event, fae.fse); + return container_of(event, struct fanotify_perm_event, fae); } static inline bool fanotify_is_perm_event(u32 mask) @@ -139,7 +187,24 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct fanotify_event, fse); } +static inline bool fanotify_event_has_path(struct fanotify_event *event) +{ + return event->type == FANOTIFY_EVENT_TYPE_PATH || + event->type == FANOTIFY_EVENT_TYPE_PATH_PERM; +} + +static inline struct path *fanotify_event_path(struct fanotify_event *event) +{ + if (event->type == FANOTIFY_EVENT_TYPE_PATH) + return &FANOTIFY_PE(event)->path; + else if (event->type == FANOTIFY_EVENT_TYPE_PATH_PERM) + return &FANOTIFY_PERM(event)->path; + else + return NULL; +} + struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct inode *inode, u32 mask, const void *data, int data_type, + const struct qstr *file_name, __kernel_fsid_t *fsid); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 0aa362b88550..42cb794c62ac 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -46,32 +46,53 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; struct kmem_cache *fanotify_mark_cache __read_mostly; -struct kmem_cache *fanotify_event_cachep __read_mostly; +struct kmem_cache *fanotify_fid_event_cachep __read_mostly; +struct kmem_cache *fanotify_path_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; #define FANOTIFY_EVENT_ALIGN 4 +#define FANOTIFY_INFO_HDR_LEN \ + (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) + +static int fanotify_fid_info_len(int fh_len, int name_len) +{ + int info_len = fh_len; + + if (name_len) + info_len += name_len + 1; + + return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN); +} static int fanotify_event_info_len(struct fanotify_event *event) { - if (!fanotify_event_has_fid(event)) - return 0; + int info_len = 0; + int fh_len = fanotify_event_object_fh_len(event); + + if (fh_len) + info_len += fanotify_fid_info_len(fh_len, 0); - return roundup(sizeof(struct fanotify_event_info_fid) + - sizeof(struct file_handle) + event->fh_len, - FANOTIFY_EVENT_ALIGN); + if (fanotify_event_name_len(event)) { + struct fanotify_name_event *fne = FANOTIFY_NE(event); + + info_len += fanotify_fid_info_len(fne->dir_fh.len, + fne->name_len); + } + + return info_len; } /* - * Get an fsnotify notification event if one exists and is small + * Get an fanotify notification event if one exists and is small * enough to fit in "count". Return an error pointer if the count * is not large enough. When permission event is dequeued, its state is * updated accordingly. */ -static struct fsnotify_event *get_one_event(struct fsnotify_group *group, +static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t count) { size_t event_size = FAN_EVENT_METADATA_LEN; - struct fsnotify_event *fsn_event = NULL; + struct fanotify_event *event = NULL; pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -85,26 +106,23 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, } if (event_size > count) { - fsn_event = ERR_PTR(-EINVAL); + event = ERR_PTR(-EINVAL); goto out; } - fsn_event = fsnotify_remove_first_event(group); - if (fanotify_is_perm_event(FANOTIFY_E(fsn_event)->mask)) - FANOTIFY_PE(fsn_event)->state = FAN_EVENT_REPORTED; + event = FANOTIFY_E(fsnotify_remove_first_event(group)); + if (fanotify_is_perm_event(event->mask)) + FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; out: spin_unlock(&group->notification_lock); - return fsn_event; + return event; } -static int create_fd(struct fsnotify_group *group, - struct fanotify_event *event, +static int create_fd(struct fsnotify_group *group, struct path *path, struct file **file) { int client_fd; struct file *new_file; - pr_debug("%s: group=%p event=%p\n", __func__, group, event); - client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); if (client_fd < 0) return client_fd; @@ -113,14 +131,9 @@ static int create_fd(struct fsnotify_group *group, * we need a new file handle for the userspace program so it can read even if it was * originally opened O_WRONLY. */ - /* it's possible this event was an overflow event. in that case dentry and mnt - * are NULL; That's fine, just don't call dentry open */ - if (event->path.dentry && event->path.mnt) - new_file = dentry_open(&event->path, - group->fanotify_data.f_flags | FMODE_NONOTIFY, - current_cred()); - else - new_file = ERR_PTR(-EOVERFLOW); + new_file = dentry_open(path, + group->fanotify_data.f_flags | FMODE_NONOTIFY, + current_cred()); if (IS_ERR(new_file)) { /* * we still send an event even if we can't open the file. this @@ -204,83 +217,111 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } -static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) +static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, + const char *name, size_t name_len, + char __user *buf, size_t count) { struct fanotify_event_info_fid info = { }; struct file_handle handle = { }; - unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh; - size_t fh_len = event->fh_len; - size_t len = fanotify_event_info_len(event); + unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf; + size_t fh_len = fh ? fh->len : 0; + size_t info_len = fanotify_fid_info_len(fh_len, name_len); + size_t len = info_len; + + pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", + __func__, fh_len, name_len, info_len, count); - if (!len) + if (!fh_len || (name && !name_len)) return 0; - if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len)) + if (WARN_ON_ONCE(len < sizeof(info) || len > count)) return -EFAULT; - /* Copy event info fid header followed by vaiable sized file handle */ - info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID; + /* + * Copy event info fid header followed by variable sized file handle + * and optionally followed by variable sized filename. + */ + info.hdr.info_type = name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : + FAN_EVENT_INFO_TYPE_FID; info.hdr.len = len; - info.fsid = event->fid.fsid; + info.fsid = *fsid; if (copy_to_user(buf, &info, sizeof(info))) return -EFAULT; buf += sizeof(info); len -= sizeof(info); - handle.handle_type = event->fh_type; + if (WARN_ON_ONCE(len < sizeof(handle))) + return -EFAULT; + + handle.handle_type = fh->type; handle.handle_bytes = fh_len; if (copy_to_user(buf, &handle, sizeof(handle))) return -EFAULT; buf += sizeof(handle); len -= sizeof(handle); + if (WARN_ON_ONCE(len < fh_len)) + return -EFAULT; + /* - * For an inline fh, copy through stack to exclude the copy from - * usercopy hardening protections. + * For an inline fh and inline file name, copy through stack to exclude + * the copy from usercopy hardening protections. */ - fh = fanotify_event_fh(event); + fh_buf = fanotify_fh_buf(fh); if (fh_len <= FANOTIFY_INLINE_FH_LEN) { - memcpy(bounce, fh, fh_len); - fh = bounce; + memcpy(bounce, fh_buf, fh_len); + fh_buf = bounce; } - if (copy_to_user(buf, fh, fh_len)) + if (copy_to_user(buf, fh_buf, fh_len)) return -EFAULT; - /* Pad with 0's */ buf += fh_len; len -= fh_len; + + if (name_len) { + /* Copy the filename with terminating null */ + name_len++; + if (WARN_ON_ONCE(len < name_len)) + return -EFAULT; + + if (copy_to_user(buf, name, name_len)) + return -EFAULT; + + buf += name_len; + len -= name_len; + } + + /* Pad with 0's */ WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); if (len > 0 && clear_user(buf, len)) return -EFAULT; - return 0; + return info_len; } static ssize_t copy_event_to_user(struct fsnotify_group *group, - struct fsnotify_event *fsn_event, + struct fanotify_event *event, char __user *buf, size_t count) { struct fanotify_event_metadata metadata; - struct fanotify_event *event; + struct path *path = fanotify_event_path(event); struct file *f = NULL; int ret, fd = FAN_NOFD; - pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); + pr_debug("%s: group=%p event=%p\n", __func__, group, event); - event = container_of(fsn_event, struct fanotify_event, fse); - metadata.event_len = FAN_EVENT_METADATA_LEN; + metadata.event_len = FAN_EVENT_METADATA_LEN + + fanotify_event_info_len(event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; metadata.pid = pid_vnr(event->pid); - if (fanotify_event_has_path(event)) { - fd = create_fd(group, event, &f); + if (path && path->mnt && path->dentry) { + fd = create_fd(group, path, &f); if (fd < 0) return fd; - } else if (fanotify_event_has_fid(event)) { - metadata.event_len += fanotify_event_info_len(event); } metadata.fd = fd; @@ -295,15 +336,39 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN)) goto out_close_fd; + buf += FAN_EVENT_METADATA_LEN; + count -= FAN_EVENT_METADATA_LEN; + if (fanotify_is_perm_event(event->mask)) - FANOTIFY_PE(fsn_event)->fd = fd; + FANOTIFY_PERM(event)->fd = fd; - if (fanotify_event_has_path(event)) { + if (f) fd_install(fd, f); - } else if (fanotify_event_has_fid(event)) { - ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN); + + /* Event info records order is: dir fid + name, child fid */ + if (fanotify_event_name_len(event)) { + struct fanotify_name_event *fne = FANOTIFY_NE(event); + + ret = copy_info_to_user(fanotify_event_fsid(event), + fanotify_event_dir_fh(event), + fne->name, fne->name_len, + buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + } + + if (fanotify_event_object_fh_len(event)) { + ret = copy_info_to_user(fanotify_event_fsid(event), + fanotify_event_object_fh(event), + NULL, 0, buf, count); if (ret < 0) return ret; + + buf += ret; + count -= ret; } return metadata.event_len; @@ -335,7 +400,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { struct fsnotify_group *group; - struct fsnotify_event *kevent; + struct fanotify_event *event; char __user *start; int ret; DEFINE_WAIT_FUNC(wait, woken_wake_function); @@ -347,13 +412,13 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, add_wait_queue(&group->notification_waitq, &wait); while (1) { - kevent = get_one_event(group, count); - if (IS_ERR(kevent)) { - ret = PTR_ERR(kevent); + event = get_one_event(group, count); + if (IS_ERR(event)) { + ret = PTR_ERR(event); break; } - if (!kevent) { + if (!event) { ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; @@ -369,7 +434,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, continue; } - ret = copy_event_to_user(group, kevent, buf, count); + ret = copy_event_to_user(group, event, buf, count); if (unlikely(ret == -EOPENSTALE)) { /* * We cannot report events with stale fd so drop it. @@ -384,17 +449,17 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, * Permission events get queued to wait for response. Other * events can be destroyed now. */ - if (!fanotify_is_perm_event(FANOTIFY_E(kevent)->mask)) { - fsnotify_destroy_event(group, kevent); + if (!fanotify_is_perm_event(event->mask)) { + fsnotify_destroy_event(group, &event->fse); } else { if (ret <= 0) { spin_lock(&group->notification_lock); finish_permission_event(group, - FANOTIFY_PE(kevent), FAN_DENY); + FANOTIFY_PERM(event), FAN_DENY); wake_up(&group->fanotify_data.access_waitq); } else { spin_lock(&group->notification_lock); - list_add_tail(&kevent->list, + list_add_tail(&event->fse.list, &group->fanotify_data.access_list); spin_unlock(&group->notification_lock); } @@ -440,8 +505,6 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; - struct fanotify_perm_event *event; - struct fsnotify_event *fsn_event; /* * Stop new events from arriving in the notification queue. since @@ -456,6 +519,8 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ spin_lock(&group->notification_lock); while (!list_empty(&group->fanotify_data.access_list)) { + struct fanotify_perm_event *event; + event = list_first_entry(&group->fanotify_data.access_list, struct fanotify_perm_event, fae.fse.list); list_del_init(&event->fae.fse.list); @@ -469,12 +534,14 @@ static int fanotify_release(struct inode *ignored, struct file *file) * response is consumed and fanotify_get_response() returns. */ while (!fsnotify_notify_queue_is_empty(group)) { - fsn_event = fsnotify_remove_first_event(group); - if (!(FANOTIFY_E(fsn_event)->mask & FANOTIFY_PERM_EVENTS)) { + struct fanotify_event *event; + + event = FANOTIFY_E(fsnotify_remove_first_event(group)); + if (!(event->mask & FANOTIFY_PERM_EVENTS)) { spin_unlock(&group->notification_lock); - fsnotify_destroy_event(group, fsn_event); + fsnotify_destroy_event(group, &event->fse); } else { - finish_permission_event(group, FANOTIFY_PE(fsn_event), + finish_permission_event(group, FANOTIFY_PERM(event), FAN_ALLOW); } spin_lock(&group->notification_lock); @@ -824,7 +891,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->memcg = get_mem_cgroup_from_mm(current->mm); oevent = fanotify_alloc_event(group, NULL, FS_Q_OVERFLOW, NULL, - FSNOTIFY_EVENT_NONE, NULL); + FSNOTIFY_EVENT_NONE, NULL, NULL); if (unlikely(!oevent)) { fd = -ENOMEM; goto out_destroy_group; @@ -1139,7 +1206,10 @@ static int __init fanotify_user_setup(void) fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC|SLAB_ACCOUNT); - fanotify_event_cachep = KMEM_CACHE(fanotify_event, SLAB_PANIC); + fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event, + SLAB_PANIC); + fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event, + SLAB_PANIC); if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) { fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 46f225580009..72d332ce8e12 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -143,15 +143,13 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) } /* Notify this dentry's parent about a child's events. */ -int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask) +int fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, + int data_type) { struct dentry *parent; struct inode *p_inode; int ret = 0; - if (!dentry) - dentry = path->dentry; - if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) return 0; @@ -168,12 +166,7 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask mask |= FS_EVENT_ON_CHILD; take_dentry_name_snapshot(&name, dentry); - if (path) - ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, - &name.name, 0); - else - ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - &name.name, 0); + ret = fsnotify(p_inode, mask, data, data_type, &name.name, 0); release_dentry_name_snapshot(&name); } @@ -181,7 +174,7 @@ int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask return ret; } -EXPORT_SYMBOL_GPL(__fsnotify_parent); +EXPORT_SYMBOL_GPL(fsnotify_parent); static int send_to_group(struct inode *to_tell, __u32 mask, const void *data, @@ -318,6 +311,7 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const struct qstr *file_name, u32 cookie) { + const struct path *path = fsnotify_data_path(data, data_is); struct fsnotify_iter_info iter_info = {}; struct super_block *sb = to_tell->i_sb; struct mount *mnt = NULL; @@ -325,8 +319,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, int ret = 0; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); - if (data_is == FSNOTIFY_EVENT_PATH) { - mnt = real_mount(((const struct path *)data)->mnt); + if (path) { + mnt = real_mount(path->mnt); mnt_or_sb_mask |= mnt->mnt_fsnotify_mask; } /* An event "on child" is not intended for a mount/sb mark */ @@ -389,7 +383,7 @@ static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index d510223d302c..2ebc89047153 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -39,7 +39,7 @@ static bool event_compare(struct fsnotify_event *old_fsn, if (old->mask & FS_IN_IGNORED) return false; if ((old->mask == new->mask) && - (old_fsn->inode == new_fsn->inode) && + (old_fsn->objectid == new_fsn->objectid) && (old->name_len == new->name_len) && (!old->name_len || !strcmp(old->name, new->name))) return true; @@ -61,6 +61,7 @@ int inotify_handle_event(struct fsnotify_group *group, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { + const struct path *path = fsnotify_data_path(data, data_type); struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct inotify_inode_mark *i_mark; struct inotify_event_info *event; @@ -73,12 +74,9 @@ int inotify_handle_event(struct fsnotify_group *group, return 0; if ((inode_mark->mask & FS_EXCL_UNLINK) && - (data_type == FSNOTIFY_EVENT_PATH)) { - const struct path *path = data; + path && d_unlinked(path->dentry)) + return 0; - if (d_unlinked(path->dentry)) - return 0; - } if (file_name) { len = file_name->len; alloc_len += len + 1; @@ -118,7 +116,7 @@ int inotify_handle_event(struct fsnotify_group *group, mask &= ~IN_ISDIR; fsn_event = &event->fse; - fsnotify_init_event(fsn_event, inode); + fsnotify_init_event(fsn_event, (unsigned long)inode); event->mask = mask; event->wd = i_mark->wd; event->sync_cookie = cookie; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 107537a543fd..81ffc8629fc4 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -635,7 +635,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) return ERR_PTR(-ENOMEM); } group->overflow_event = &oevent->fse; - fsnotify_init_event(group->overflow_event, NULL); + fsnotify_init_event(group->overflow_event, 0); oevent->mask = FS_Q_OVERFLOW; oevent->wd = -1; oevent->sync_cookie = 0; diff --git a/fs/nsfs.c b/fs/nsfs.c index b13bfd406820..4f1205725cfe 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -247,6 +247,20 @@ out_invalid: return ERR_PTR(-EINVAL); } +/** + * ns_match() - Returns true if current namespace matches dev/ino provided. + * @ns_common: current ns + * @dev: dev_t from nsfs that will be matched against current nsfs + * @ino: ino_t from nsfs that will be matched against current nsfs + * + * Return: true if dev and ino matches the current nsfs. + */ +bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino) +{ + return (ns->inum == ino) && (nsfs_mnt->mnt_sb->s_dev == dev); +} + + static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) { struct inode *inode = d_inode(dentry); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 88534eb0e7c2..2f834add165b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1060,7 +1060,6 @@ bail: brelse(bhs[i]); bhs[i] = NULL; } - mlog_errno(status); } return status; } @@ -3942,7 +3941,7 @@ rotate: * above. * * This leaf needs to have space, either by the empty 1st - * extent record, or by virtue of an l_next_rec < l_count. + * extent record, or by virtue of an l_next_free_rec < l_count. */ ocfs2_rotate_leaf(el, insert_rec); } @@ -7403,6 +7402,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_inline_data *idata = &di->id2.i_data; + /* No need to punch hole beyond i_size. */ + if (start >= i_size_read(inode)) + return 0; + if (end > i_size_read(inode)) end = i_size_read(inode); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index a368350d4c27..89d13e0705fe 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -101,8 +101,6 @@ static struct o2hb_callback { static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); -#define O2HB_DEFAULT_BLOCK_BITS 9 - enum o2hb_heartbeat_modes { O2HB_HEARTBEAT_LOCAL = 0, O2HB_HEARTBEAT_GLOBAL, @@ -1309,7 +1307,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) case O2HB_DB_TYPE_REGION_NUMBER: reg = (struct o2hb_region *)db->db_data; - out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", + out += scnprintf(buf + out, PAGE_SIZE - out, "%d\n", reg->hr_region_num); goto done; @@ -1319,12 +1317,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) /* If 0, it has never been set before */ if (lts) lts = jiffies_to_msecs(jiffies - lts); - out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); + out += scnprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); goto done; case O2HB_DB_TYPE_REGION_PINNED: reg = (struct o2hb_region *)db->db_data; - out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + out += scnprintf(buf + out, PAGE_SIZE - out, "%u\n", !!reg->hr_item_pinned); goto done; @@ -1333,8 +1331,8 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) } while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) - out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); - out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += scnprintf(buf + out, PAGE_SIZE - out, "\n"); done: i_size_write(inode, out); diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 02bf4a1774cc..667a5c5e1f66 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -443,8 +443,8 @@ static int o2net_fill_bitmap(char *buf, int len) o2net_fill_node_map(map, sizeof(map)); while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) - out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); - out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += scnprintf(buf + out, PAGE_SIZE - out, "\n"); return out; } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 48a3398f0bf5..2c512b40a940 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1570,15 +1570,13 @@ static void o2net_start_connect(struct work_struct *work) struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; int ret = 0, stop; unsigned int timeout; - unsigned int noio_flag; + unsigned int nofs_flag; /* - * sock_create allocates the sock with GFP_KERNEL. We must set - * per-process flag PF_MEMALLOC_NOIO so that all allocations done - * by this process are done as if GFP_NOIO was specified. So we - * are not reentering filesystem while doing memory reclaim. + * sock_create allocates the sock with GFP_KERNEL. We must + * prevent the filesystem from being reentered by memory reclaim. */ - noio_flag = memalloc_noio_save(); + nofs_flag = memalloc_nofs_save(); /* if we're greater we initiate tx, otherwise we accept */ if (o2nm_this_node() <= o2net_num_from_nn(nn)) goto out; @@ -1683,7 +1681,7 @@ out: if (mynode) o2nm_node_put(mynode); - memalloc_noio_restore(noio_flag); + memalloc_nofs_restore(nofs_flag); return; } @@ -1810,15 +1808,13 @@ static int o2net_accept_one(struct socket *sock, int *more) struct o2nm_node *local_node = NULL; struct o2net_sock_container *sc = NULL; struct o2net_node *nn; - unsigned int noio_flag; + unsigned int nofs_flag; /* - * sock_create_lite allocates the sock with GFP_KERNEL. We must set - * per-process flag PF_MEMALLOC_NOIO so that all allocations done - * by this process are done as if GFP_NOIO was specified. So we - * are not reentering filesystem while doing memory reclaim. + * sock_create_lite allocates the sock with GFP_KERNEL. We must + * prevent the filesystem from being reentered by memory reclaim. */ - noio_flag = memalloc_noio_save(); + nofs_flag = memalloc_nofs_save(); BUG_ON(sock == NULL); *more = 0; @@ -1934,7 +1930,7 @@ out: if (sc) sc_put(sc); - memalloc_noio_restore(noio_flag); + memalloc_nofs_restore(nofs_flag); return ret; } @@ -1948,7 +1944,6 @@ static void o2net_accept_many(struct work_struct *work) { struct socket *sock = o2net_listen_sock; int more; - int err; /* * It is critical to note that due to interrupt moderation @@ -1963,7 +1958,7 @@ static void o2net_accept_many(struct work_struct *work) */ for (;;) { - err = o2net_accept_one(sock, &more); + o2net_accept_one(sock, &more); if (!more) break; cond_resched(); diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index de87cbffd175..736338f45c59 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -32,7 +32,7 @@ struct o2net_msg __be32 status; __be32 key; __be32 msg_num; - __u8 buf[0]; + __u8 buf[]; }; typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data, diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index bdef72c0f099..5761060d2ba8 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -676,7 +676,7 @@ static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, int ra_ptr = 0; /* Current index into readahead buffer */ int num = 0; - int nblocks, i, err; + int nblocks, i; sb = dir->i_sb; @@ -708,7 +708,7 @@ restart: num++; bh = NULL; - err = ocfs2_read_dir_block(dir, b++, &bh, + ocfs2_read_dir_block(dir, b++, &bh, OCFS2_BH_READAHEAD); bh_use[ra_max] = bh; } diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 0463dce65bb2..c8a444622faa 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -564,7 +564,7 @@ struct dlm_migratable_lockres // 48 bytes u8 lvb[DLM_LVB_LEN]; // 112 bytes - struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112 + struct dlm_migratable_lock ml[]; // 16 bytes each, begins at byte 112 }; #define DLM_MIG_LOCKRES_MAX_LEN \ (sizeof(struct dlm_migratable_lockres) + \ @@ -601,7 +601,7 @@ struct dlm_convert_lock u8 name[O2NM_MAX_NAME_LEN]; - s8 lvb[0]; + s8 lvb[]; }; #define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct dlm_convert_lock)+DLM_LVB_LEN) @@ -616,7 +616,7 @@ struct dlm_unlock_lock u8 name[O2NM_MAX_NAME_LEN]; - s8 lvb[0]; + s8 lvb[]; }; #define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN) @@ -632,7 +632,7 @@ struct dlm_proxy_ast u8 name[O2NM_MAX_NAME_LEN]; - s8 lvb[0]; + s8 lvb[]; }; #define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index c5c6efba7b5e..4b8b41d23e91 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -244,11 +244,11 @@ static int stringify_lockname(const char *lockname, int locklen, char *buf, memcpy((__be64 *)&inode_blkno_be, (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START], sizeof(__be64)); - out += snprintf(buf + out, len - out, "%.*s%08x", + out += scnprintf(buf + out, len - out, "%.*s%08x", OCFS2_DENTRY_LOCK_INO_START - 1, lockname, (unsigned int)be64_to_cpu(inode_blkno_be)); } else - out += snprintf(buf + out, len - out, "%.*s", + out += scnprintf(buf + out, len - out, "%.*s", locklen, lockname); return out; } @@ -260,7 +260,7 @@ static int stringify_nodemap(unsigned long *nodemap, int maxnodes, int i = -1; while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes) - out += snprintf(buf + out, len - out, "%d ", i); + out += scnprintf(buf + out, len - out, "%d ", i); return out; } @@ -278,34 +278,34 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) mle_type = "MIG"; out += stringify_lockname(mle->mname, mle->mnamelen, buf + out, len - out); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", mle_type, mle->master, mle->new_master, !list_empty(&mle->hb_events), !!mle->inuse, kref_read(&mle->mle_refs)); - out += snprintf(buf + out, len - out, "Maybe="); + out += scnprintf(buf + out, len - out, "Maybe="); out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "Vote="); + out += scnprintf(buf + out, len - out, "Vote="); out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "Response="); + out += scnprintf(buf + out, len - out, "Response="); out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "Node="); + out += scnprintf(buf + out, len - out, "Node="); out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); return out; } @@ -353,7 +353,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) int out = 0; unsigned long total = 0; - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Dumping Purgelist for Domain: %s\n", dlm->name); spin_lock(&dlm->spinlock); @@ -365,13 +365,13 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) out += stringify_lockname(res->lockname.name, res->lockname.len, buf + out, len - out); - out += snprintf(buf + out, len - out, "\t%ld\n", + out += scnprintf(buf + out, len - out, "\t%ld\n", (jiffies - res->last_used)/HZ); spin_unlock(&res->spinlock); } spin_unlock(&dlm->spinlock); - out += snprintf(buf + out, len - out, "Total on list: %lu\n", total); + out += scnprintf(buf + out, len - out, "Total on list: %lu\n", total); return out; } @@ -410,7 +410,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) int i, out = 0; unsigned long total = 0, longest = 0, bucket_count = 0; - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Dumping MLEs for Domain: %s\n", dlm->name); spin_lock(&dlm->master_lock); @@ -428,7 +428,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) } spin_unlock(&dlm->master_lock); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Total: %lu, Longest: %lu\n", total, longest); return out; } @@ -467,7 +467,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) #define DEBUG_LOCK_VERSION 1 spin_lock(&lock->spinlock); - out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," + out = scnprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," "%d,%d,%d,%d\n", DEBUG_LOCK_VERSION, list_type, lock->ml.type, lock->ml.convert_type, @@ -491,13 +491,13 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) int i; int out = 0; - out += snprintf(buf + out, len - out, "NAME:"); + out += scnprintf(buf + out, len - out, "NAME:"); out += stringify_lockname(res->lockname.name, res->lockname.len, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); #define DEBUG_LRES_VERSION 1 - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n", DEBUG_LRES_VERSION, res->owner, res->state, res->last_used, @@ -509,17 +509,17 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) kref_read(&res->refs)); /* refmap */ - out += snprintf(buf + out, len - out, "RMAP:"); + out += scnprintf(buf + out, len - out, "RMAP:"); out += stringify_nodemap(res->refmap, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* lvb */ - out += snprintf(buf + out, len - out, "LVBX:"); + out += scnprintf(buf + out, len - out, "LVBX:"); for (i = 0; i < DLM_LVB_LEN; i++) - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%02x", (unsigned char)res->lvb[i]); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* granted */ list_for_each_entry(lock, &res->granted, list) @@ -533,7 +533,7 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) list_for_each_entry(lock, &res->blocked, list) out += dump_lock(lock, 2, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); return out; } @@ -683,41 +683,41 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) } /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Domain: %s Key: 0x%08x Protocol: %d.%d\n", dlm->name, dlm->key, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); /* Thread Pid: xxx Node: xxx State: xxxxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Thread Pid: %d Node: %d State: %s\n", task_pid_nr(dlm->dlm_thread_task), dlm->node_num, state); /* Number of Joins: xxx Joining Node: xxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Number of Joins: %d Joining Node: %d\n", dlm->num_joins, dlm->joining_node); /* Domain Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Domain Map: "); + out += scnprintf(buf + out, len - out, "Domain Map: "); out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Exit Domain Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Exit Domain Map: "); + out += scnprintf(buf + out, len - out, "Exit Domain Map: "); out += stringify_nodemap(dlm->exit_domain_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Live Map: xx xx xx */ - out += snprintf(buf + out, len - out, "Live Map: "); + out += scnprintf(buf + out, len - out, "Live Map: "); out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Lock Resources: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Lock Resources: %d (%d)\n", atomic_read(&dlm->res_cur_count), atomic_read(&dlm->res_tot_count)); @@ -729,29 +729,29 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) cur_mles += atomic_read(&dlm->mle_cur_count[i]); /* MLEs: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "MLEs: %d (%d)\n", cur_mles, tot_mles); /* Blocking: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, " Blocking: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_BLOCK]), atomic_read(&dlm->mle_tot_count[DLM_MLE_BLOCK])); /* Mastery: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, " Mastery: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MASTER]), atomic_read(&dlm->mle_tot_count[DLM_MLE_MASTER])); /* Migration: xxx (xxx) */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, " Migration: %d (%d)\n", atomic_read(&dlm->mle_cur_count[DLM_MLE_MIGRATION]), atomic_read(&dlm->mle_tot_count[DLM_MLE_MIGRATION])); /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Lists: Dirty=%s Purge=%s PendingASTs=%s " "PendingBASTs=%s\n", (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), @@ -760,12 +760,12 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) (list_empty(&dlm->pending_basts) ? "Empty" : "InUse")); /* Purge Count: xxx Refs: xxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Purge Count: %d Refs: %d\n", dlm->purge_count, kref_read(&dlm->dlm_refs)); /* Dead Node: xxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Dead Node: %d\n", dlm->reco.dead_node); /* What about DLM_RECO_STATE_FINALIZE? */ @@ -775,19 +775,19 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) state = "INACTIVE"; /* Recovery Pid: xxxx Master: xxx State: xxxx */ - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "Recovery Pid: %d Master: %d State: %s\n", task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, state); /* Recovery Map: xx xx */ - out += snprintf(buf + out, len - out, "Recovery Map: "); + out += scnprintf(buf + out, len - out, "Recovery Map: "); out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, buf + out, len - out); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); /* Recovery Node State: */ - out += snprintf(buf + out, len - out, "Recovery Node State:\n"); + out += scnprintf(buf + out, len - out, "Recovery Node State:\n"); list_for_each_entry(node, &dlm->reco.node_data, list) { switch (node->state) { case DLM_RECO_NODE_DATA_INIT: @@ -815,7 +815,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len) state = "BAD"; break; } - out += snprintf(buf + out, len - out, "\t%u - %s\n", + out += scnprintf(buf + out, len - out, "\t%u - %s\n", node->node_num, state); } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 900f7e466d11..55a6512e9fde 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2749,8 +2749,6 @@ leave: return ret; } -#define DLM_MIGRATION_RETRY_MS 100 - /* * Should be called only after beginning the domain leave process. * There should not be any remaining locks on nonlocal lock resources, diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index fd40c17cd022..5ccc4ff0b82a 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -39,8 +39,6 @@ static int dlm_thread(void *data); static void dlm_flush_asts(struct dlm_ctxt *dlm); -#define dlm_lock_is_remote(dlm, lock) ((lock)->ml.node != (dlm)->node_num) - /* will exit holding res->spinlock, but may drop in function */ /* waits until flags are cleared on res->state */ void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags) @@ -680,7 +678,6 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm) #define DLM_THREAD_TIMEOUT_MS (4 * 1000) #define DLM_THREAD_MAX_DIRTY 100 -#define DLM_THREAD_MAX_ASTS 10 static int dlm_thread(void *data) { diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index cb9e6a73bea9..152a0fc4e905 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2133,7 +2133,7 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, } #define OCFS2_SEC_BITS 34 -#define OCFS2_SEC_SHIFT (64 - 34) +#define OCFS2_SEC_SHIFT (64 - OCFS2_SEC_BITS) #define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1) /* LVB only has room for 64 bits of time here so we pack it for diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 68ba354cf361..b425f0b01dce 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -91,7 +91,7 @@ enum ocfs2_replay_state { struct ocfs2_replay_map { unsigned int rm_slots; enum ocfs2_replay_state rm_state; - unsigned char rm_replay_slots[0]; + unsigned char rm_replay_slots[]; }; static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index da65251ef815..5381020aaa9a 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -406,7 +406,7 @@ static int ocfs2_mknod(struct inode *dir, if (status < 0) { mlog_errno(status); - goto leave; + goto roll_back; } if (si.enable) { @@ -414,7 +414,7 @@ static int ocfs2_mknod(struct inode *dir, meta_ac, data_ac); if (status < 0) { mlog_errno(status); - goto leave; + goto roll_back; } } @@ -427,7 +427,7 @@ static int ocfs2_mknod(struct inode *dir, OCFS2_I(dir)->ip_blkno); if (status) { mlog_errno(status); - goto leave; + goto roll_back; } dl = dentry->d_fsdata; @@ -437,12 +437,19 @@ static int ocfs2_mknod(struct inode *dir, &lookup); if (status < 0) { mlog_errno(status); - goto leave; + goto roll_back; } insert_inode_hash(inode); d_instantiate(dentry, inode); status = 0; + +roll_back: + if (status < 0 && S_ISDIR(mode)) { + ocfs2_add_links_count(dirfe, -1); + drop_nlink(dir); + } + leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 0db4a7ec58a2..0dd8c41bafd4 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -470,7 +470,7 @@ struct ocfs2_extent_list { __le16 l_reserved1; __le64 l_reserved2; /* Pad to sizeof(ocfs2_extent_rec) */ -/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */ +/*10*/ struct ocfs2_extent_rec l_recs[]; /* Extent records */ }; /* @@ -484,7 +484,7 @@ struct ocfs2_chain_list { __le16 cl_count; /* Total chains in this list */ __le16 cl_next_free_rec; /* Next unused chain slot */ __le64 cl_reserved1; -/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */ +/*10*/ struct ocfs2_chain_rec cl_recs[]; /* Chain records */ }; /* @@ -496,7 +496,7 @@ struct ocfs2_truncate_log { /*00*/ __le16 tl_count; /* Total records in this log */ __le16 tl_used; /* Number of records in use */ __le32 tl_reserved1; -/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */ +/*08*/ struct ocfs2_truncate_rec tl_recs[]; /* Truncate records */ }; /* @@ -640,7 +640,7 @@ struct ocfs2_local_alloc __le16 la_size; /* Size of included bitmap, in bytes */ __le16 la_reserved1; __le64 la_reserved2; -/*10*/ __u8 la_bitmap[0]; +/*10*/ __u8 la_bitmap[]; }; /* @@ -653,7 +653,7 @@ struct ocfs2_inline_data * for data, starting at id_data */ __le16 id_reserved0; __le32 id_reserved1; - __u8 id_data[0]; /* Start of user data */ + __u8 id_data[]; /* Start of user data */ }; /* @@ -798,7 +798,7 @@ struct ocfs2_dx_entry_list { * possible in de_entries */ __le16 de_num_used; /* Current number of * de_entries entries */ - struct ocfs2_dx_entry de_entries[0]; /* Indexed dir entries + struct ocfs2_dx_entry de_entries[]; /* Indexed dir entries * in a packed array of * length de_num_used */ }; @@ -935,7 +935,7 @@ struct ocfs2_refcount_list { __le16 rl_used; /* Current number of used records */ __le32 rl_reserved2; __le64 rl_reserved1; /* Pad to sizeof(ocfs2_refcount_record) */ -/*10*/ struct ocfs2_refcount_rec rl_recs[0]; /* Refcount records */ +/*10*/ struct ocfs2_refcount_rec rl_recs[]; /* Refcount records */ }; @@ -1021,7 +1021,7 @@ struct ocfs2_xattr_header { buckets. A block uses xb_check and sets this field to zero.) */ - struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ + struct ocfs2_xattr_entry xh_entries[]; /* xattr entry list. */ }; /* @@ -1207,7 +1207,7 @@ struct ocfs2_local_disk_dqinfo { /* Header of one chunk of a quota file */ struct ocfs2_local_disk_chunk { __le32 dqc_free; /* Number of free entries in the bitmap */ - __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + __u8 dqc_bitmap[]; /* Bitmap of entries in the corresponding * chunk of quota file */ }; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ee43e51188be..cfb77f70c888 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -154,6 +154,7 @@ ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci) } static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) +__acquires(&rf->rf_lock) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); @@ -161,6 +162,7 @@ static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) } static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci) +__releases(&rf->rf_lock) { struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 0249e8ca1028..bf3842e34fb9 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -33,9 +33,6 @@ static DEFINE_SPINLOCK(resv_lock); -#define OCFS2_MIN_RESV_WINDOW_BITS 8 -#define OCFS2_MAX_RESV_WINDOW_BITS 1024 - int ocfs2_dir_resv_allowed(struct ocfs2_super *osb) { return (osb->osb_resv_level && osb->osb_dir_resv_level); diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 8aa6a667860c..a191094694c6 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -656,8 +656,6 @@ error: * and easier to preserve the name. */ -#define FS_OCFS2_NM 1 - static struct ctl_table ocfs2_nm_table[] = { { .procname = "hb_ctl_path", diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 939df99d2dec..4836becb7578 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2509,9 +2509,6 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, bail: brelse(group_bh); - - if (status) - mlog_errno(status); return status; } @@ -2582,8 +2579,6 @@ static int _ocfs2_free_clusters(handle_t *handle, num_clusters); out: - if (status) - mlog_errno(status); return status; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 05dd68ade293..ac61eeaf3837 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -220,31 +220,31 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) int i, out = 0; unsigned long flags; - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Id: %-s Uuid: %-s Gen: 0x%X Label: %-s\n", "Device", osb->dev_str, osb->uuid_str, osb->fs_generation, osb->vol_label); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => State: %d Flags: 0x%lX\n", "Volume", atomic_read(&osb->vol_state), osb->osb_flags); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Block: %lu Cluster: %d\n", "Sizes", osb->sb->s_blocksize, osb->s_clustersize); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Compat: 0x%X Incompat: 0x%X " "ROcompat: 0x%X\n", "Features", osb->s_feature_compat, osb->s_feature_incompat, osb->s_feature_ro_compat); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Opts: 0x%lX AtimeQuanta: %u\n", "Mount", osb->s_mount_opt, osb->s_atime_quantum); if (cconn) { - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Stack: %s Name: %*s " "Version: %d.%d\n", "Cluster", (*osb->osb_cluster_stack == '\0' ? @@ -255,7 +255,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) } spin_lock_irqsave(&osb->dc_task_lock, flags); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Pid: %d Count: %lu WakeSeq: %lu " "WorkSeq: %lu\n", "DownCnvt", (osb->dc_task ? task_pid_nr(osb->dc_task) : -1), @@ -264,32 +264,32 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) spin_unlock_irqrestore(&osb->dc_task_lock, flags); spin_lock(&osb->osb_lock); - out += snprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", + out += scnprintf(buf + out, len - out, "%10s => Pid: %d Nodes:", "Recovery", (osb->recovery_thread_task ? task_pid_nr(osb->recovery_thread_task) : -1)); if (rm->rm_used == 0) - out += snprintf(buf + out, len - out, " None\n"); + out += scnprintf(buf + out, len - out, " None\n"); else { for (i = 0; i < rm->rm_used; i++) - out += snprintf(buf + out, len - out, " %d", + out += scnprintf(buf + out, len - out, " %d", rm->rm_entries[i]); - out += snprintf(buf + out, len - out, "\n"); + out += scnprintf(buf + out, len - out, "\n"); } spin_unlock(&osb->osb_lock); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => Pid: %d Interval: %lu\n", "Commit", (osb->commit_task ? task_pid_nr(osb->commit_task) : -1), osb->osb_commit_interval); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => State: %d TxnId: %lu NumTxns: %d\n", "Journal", osb->journal->j_state, osb->journal->j_trans_id, atomic_read(&osb->journal->j_num_trans)); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => GlobalAllocs: %d LocalAllocs: %d " "SubAllocs: %d LAWinMoves: %d SAExtends: %d\n", "Stats", @@ -299,7 +299,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) atomic_read(&osb->alloc_stats.moves), atomic_read(&osb->alloc_stats.bg_extends)); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => State: %u Descriptor: %llu Size: %u bits " "Default: %u bits\n", "LocalAlloc", osb->local_alloc_state, @@ -307,7 +307,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) osb->local_alloc_bits, osb->local_alloc_default_bits); spin_lock(&osb->osb_lock); - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s => InodeSlot: %d StolenInodes: %d, " "MetaSlot: %d StolenMeta: %d\n", "Steal", osb->s_inode_steal_slot, @@ -316,20 +316,20 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) atomic_read(&osb->s_num_meta_stolen)); spin_unlock(&osb->osb_lock); - out += snprintf(buf + out, len - out, "OrphanScan => "); - out += snprintf(buf + out, len - out, "Local: %u Global: %u ", + out += scnprintf(buf + out, len - out, "OrphanScan => "); + out += scnprintf(buf + out, len - out, "Local: %u Global: %u ", os->os_count, os->os_seqno); - out += snprintf(buf + out, len - out, " Last Scan: "); + out += scnprintf(buf + out, len - out, " Last Scan: "); if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) - out += snprintf(buf + out, len - out, "Disabled\n"); + out += scnprintf(buf + out, len - out, "Disabled\n"); else - out += snprintf(buf + out, len - out, "%lu seconds ago\n", + out += scnprintf(buf + out, len - out, "%lu seconds ago\n", (unsigned long)(ktime_get_seconds() - os->os_scantime)); - out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", + out += scnprintf(buf + out, len - out, "%10s => %3s %10s\n", "Slots", "Num", "RecoGen"); for (i = 0; i < osb->max_slots; ++i) { - out += snprintf(buf + out, len - out, + out += scnprintf(buf + out, len - out, "%10s %c %3d %10d\n", " ", (i == osb->slot_num ? '*' : ' '), diff --git a/fs/open.c b/fs/open.c index b69d6eed67e6..719b320ede52 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1046,8 +1046,10 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op) if (flags & O_CREAT) { op->intent |= LOOKUP_CREATE; - if (flags & O_EXCL) + if (flags & O_EXCL) { op->intent |= LOOKUP_EXCL; + flags |= O_NOFOLLOW; + } } if (flags & O_DIRECTORY) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index c740159d9ad1..af375e049aae 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -346,23 +346,8 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { int ret; - struct orangefs_read_options *ro; - orangefs_stats.reads++; - /* - * Remember how they set "count" in read(2) or pread(2) or whatever - - * users can use count as a knob to control orangefs io size and later - * we can try to help them fill as many pages as possible in readpage. - */ - if (!iocb->ki_filp->private_data) { - iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL); - if (!iocb->ki_filp->private_data) - return(ENOMEM); - ro = iocb->ki_filp->private_data; - ro->blksiz = iter->count; - } - down_read(&file_inode(iocb->ki_filp)->i_rwsem); ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); if (ret) @@ -650,12 +635,6 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) return rc; } -static int orangefs_file_open(struct inode * inode, struct file *file) -{ - file->private_data = NULL; - return generic_file_open(inode, file); -} - static int orangefs_flush(struct file *file, fl_owner_t id) { /* @@ -666,19 +645,8 @@ static int orangefs_flush(struct file *file, fl_owner_t id) * on an explicit fsync call. This duplicates historical OrangeFS * behavior. */ - struct inode *inode = file->f_mapping->host; int r; - kfree(file->private_data); - file->private_data = NULL; - - if (inode->i_state & I_DIRTY_TIME) { - spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; - spin_unlock(&inode->i_lock); - mark_inode_dirty_sync(inode); - } - r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX); if (r > 0) return 0; @@ -694,7 +662,7 @@ const struct file_operations orangefs_file_operations = { .lock = orangefs_lock, .unlocked_ioctl = orangefs_ioctl, .mmap = orangefs_file_mmap, - .open = orangefs_file_open, + .open = generic_file_open, .flush = orangefs_flush, .release = orangefs_file_release, .fsync = orangefs_fsync, diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 961c0fd8675a..12ae630fbed7 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -259,46 +259,19 @@ static int orangefs_readpage(struct file *file, struct page *page) pgoff_t index; /* which page */ struct page *next_page; char *kaddr; - struct orangefs_read_options *ro = file->private_data; loff_t read_size; - loff_t roundedup; int buffer_index = -1; /* orangefs shared memory slot */ int slot_index; /* index into slot */ int remaining; /* - * If they set some miniscule size for "count" in read(2) - * (for example) then let's try to read a page, or the whole file - * if it is smaller than a page. Once "count" goes over a page - * then lets round up to the highest page size multiple that is - * less than or equal to "count" and do that much orangefs IO and - * try to fill as many pages as we can from it. - * - * "count" should be represented in ro->blksiz. - * - * inode->i_size = file size. + * Get up to this many bytes from Orangefs at a time and try + * to fill them into the page cache at once. Tests with dd made + * this seem like a reasonable static number, if there was + * interest perhaps this number could be made setable through + * sysfs... */ - if (ro) { - if (ro->blksiz < PAGE_SIZE) { - if (inode->i_size < PAGE_SIZE) - read_size = inode->i_size; - else - read_size = PAGE_SIZE; - } else { - roundedup = ((PAGE_SIZE - 1) & ro->blksiz) ? - ((ro->blksiz + PAGE_SIZE) & ~(PAGE_SIZE -1)) : - ro->blksiz; - if (roundedup > inode->i_size) - read_size = inode->i_size; - else - read_size = roundedup; - - } - } else { - read_size = PAGE_SIZE; - } - if (!read_size) - read_size = PAGE_SIZE; + read_size = 524288; if (PageDirty(page)) orangefs_launder_page(page); diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index ed67f39fa7ce..e12aeb9623d6 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -239,10 +239,6 @@ struct orangefs_write_range { kgid_t gid; }; -struct orangefs_read_options { - ssize_t blksiz; -}; - extern struct orangefs_stats orangefs_stats; /* diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 9fc47c2e078d..9709cf22cab3 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -36,6 +36,13 @@ static int ovl_ccup_get(char *buf, const struct kernel_param *param) module_param_call(check_copy_up, ovl_ccup_set, ovl_ccup_get, NULL, 0644); MODULE_PARM_DESC(check_copy_up, "Obsolete; does nothing"); +static bool ovl_must_copy_xattr(const char *name) +{ + return !strcmp(name, XATTR_POSIX_ACL_ACCESS) || + !strcmp(name, XATTR_POSIX_ACL_DEFAULT) || + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); +} + int ovl_copy_xattr(struct dentry *old, struct dentry *new) { ssize_t list_size, size, value_size = 0; @@ -107,8 +114,13 @@ retry: continue; /* Discard */ } error = vfs_setxattr(new, name, value, size, 0); - if (error) - break; + if (error) { + if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name)) + break; + + /* Ignore failure to copy unknown xattrs */ + error = 0; + } } kfree(value); out: diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 8e57d5372b8f..279009dee366 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -42,7 +42,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) return err; } -static struct dentry *ovl_lookup_temp(struct dentry *workdir) +struct dentry *ovl_lookup_temp(struct dentry *workdir) { struct dentry *temp; char name[20]; @@ -243,6 +243,9 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, ovl_dir_modified(dentry->d_parent, false); ovl_dentry_set_upper_alias(dentry); + ovl_dentry_update_reval(dentry, newdentry, + DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); + if (!hardlink) { /* * ovl_obtain_alias() can be called after ovl_create_real() @@ -819,6 +822,28 @@ static bool ovl_pure_upper(struct dentry *dentry) !ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry)); } +static void ovl_drop_nlink(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct dentry *alias; + + /* Try to find another, hashed alias */ + spin_lock(&inode->i_lock); + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + if (alias != dentry && !d_unhashed(alias)) + break; + } + spin_unlock(&inode->i_lock); + + /* + * Changes to underlying layers may cause i_nlink to lose sync with + * reality. In this case prevent the link count from going to zero + * prematurely. + */ + if (inode->i_nlink > !!alias) + drop_nlink(inode); +} + static int ovl_do_remove(struct dentry *dentry, bool is_dir) { int err; @@ -856,7 +881,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (is_dir) clear_nlink(dentry->d_inode); else - drop_nlink(dentry->d_inode); + ovl_drop_nlink(dentry); } ovl_nlink_end(dentry); @@ -1201,7 +1226,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (new_is_dir) clear_nlink(d_inode(new)); else - drop_nlink(d_inode(new)); + ovl_drop_nlink(new); } ovl_dir_modified(old->d_parent, ovl_type_origin(old) || diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 6f54d70cef27..475c61f53f0f 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -308,29 +308,35 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, ovl_set_flag(OVL_UPPERDATA, inode); dentry = d_find_any_alias(inode); - if (!dentry) { - dentry = d_alloc_anon(inode->i_sb); - if (!dentry) - goto nomem; - oe = ovl_alloc_entry(lower ? 1 : 0); - if (!oe) - goto nomem; - - if (lower) { - oe->lowerstack->dentry = dget(lower); - oe->lowerstack->layer = lowerpath->layer; - } - dentry->d_fsdata = oe; - if (upper_alias) - ovl_dentry_set_upper_alias(dentry); + if (dentry) + goto out_iput; + + dentry = d_alloc_anon(inode->i_sb); + if (unlikely(!dentry)) + goto nomem; + oe = ovl_alloc_entry(lower ? 1 : 0); + if (!oe) + goto nomem; + + if (lower) { + oe->lowerstack->dentry = dget(lower); + oe->lowerstack->layer = lowerpath->layer; } + dentry->d_fsdata = oe; + if (upper_alias) + ovl_dentry_set_upper_alias(dentry); + + ovl_dentry_update_reval(dentry, upper, + DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); return d_instantiate_anon(dentry, inode); nomem: - iput(inode); dput(dentry); - return ERR_PTR(-ENOMEM); + dentry = ERR_PTR(-ENOMEM); +out_iput: + iput(inode); + return dentry; } /* Get the upper or lower dentry in stach whose on layer @idx */ diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 79e8994e3bc1..b0d42ece4d7c 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -79,6 +79,7 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) { bool samefs = ovl_same_fs(dentry->d_sb); unsigned int xinobits = ovl_xino_bits(dentry->d_sb); + unsigned int xinoshift = 64 - xinobits; if (samefs) { /* @@ -89,22 +90,22 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) stat->dev = dentry->d_sb->s_dev; return 0; } else if (xinobits) { - unsigned int shift = 64 - xinobits; /* * All inode numbers of underlying fs should not be using the * high xinobits, so we use high xinobits to partition the * overlay st_ino address space. The high bits holds the fsid - * (upper fsid is 0). This way overlay inode numbers are unique - * and all inodes use overlay st_dev. Inode numbers are also - * persistent for a given layer configuration. + * (upper fsid is 0). The lowest xinobit is reserved for mapping + * the non-peresistent inode numbers range in case of overflow. + * This way all overlay inode numbers are unique and use the + * overlay st_dev. */ - if (stat->ino >> shift) { - pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", - dentry, stat->ino, xinobits); - } else { - stat->ino |= ((u64)fsid) << shift; + if (likely(!(stat->ino >> xinoshift))) { + stat->ino |= ((u64)fsid) << (xinoshift + 1); stat->dev = dentry->d_sb->s_dev; return 0; + } else if (ovl_xino_warn(dentry->d_sb)) { + pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n", + dentry, stat->ino, xinobits); } } @@ -504,7 +505,7 @@ static const struct address_space_operations ovl_aops = { /* * It is possible to stack overlayfs instance on top of another - * overlayfs instance as lower layer. We need to annonate the + * overlayfs instance as lower layer. We need to annotate the * stackable i_mutex locks according to stack level of the super * block instance. An overlayfs instance can never be in stack * depth 0 (there is always a real fs below it). An overlayfs @@ -561,27 +562,73 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) #endif } -static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev, - unsigned long ino, int fsid) +static void ovl_next_ino(struct inode *inode) +{ + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + + inode->i_ino = atomic_long_inc_return(&ofs->last_ino); + if (unlikely(!inode->i_ino)) + inode->i_ino = atomic_long_inc_return(&ofs->last_ino); +} + +static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid) { int xinobits = ovl_xino_bits(inode->i_sb); + unsigned int xinoshift = 64 - xinobits; /* * When d_ino is consistent with st_ino (samefs or i_ino has enough * bits to encode layer), set the same value used for st_ino to i_ino, * so inode number exposed via /proc/locks and a like will be * consistent with d_ino and st_ino values. An i_ino value inconsistent - * with d_ino also causes nfsd readdirplus to fail. When called from - * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real - * upper inode i_ino on ovl_inode_init() or ovl_inode_update(). + * with d_ino also causes nfsd readdirplus to fail. */ - if (ovl_same_dev(inode->i_sb)) { - inode->i_ino = ino; - if (xinobits && fsid && !(ino >> (64 - xinobits))) - inode->i_ino |= (unsigned long)fsid << (64 - xinobits); - } else { - inode->i_ino = get_next_ino(); + inode->i_ino = ino; + if (ovl_same_fs(inode->i_sb)) { + return; + } else if (xinobits && likely(!(ino >> xinoshift))) { + inode->i_ino |= (unsigned long)fsid << (xinoshift + 1); + return; + } + + /* + * For directory inodes on non-samefs with xino disabled or xino + * overflow, we allocate a non-persistent inode number, to be used for + * resolving st_ino collisions in ovl_map_dev_ino(). + * + * To avoid ino collision with legitimate xino values from upper + * layer (fsid 0), use the lowest xinobit to map the non + * persistent inode numbers to the unified st_ino address space. + */ + if (S_ISDIR(inode->i_mode)) { + ovl_next_ino(inode); + if (xinobits) { + inode->i_ino &= ~0UL >> xinobits; + inode->i_ino |= 1UL << xinoshift; + } } +} + +void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, + unsigned long ino, int fsid) +{ + struct inode *realinode; + + if (oip->upperdentry) + OVL_I(inode)->__upperdentry = oip->upperdentry; + if (oip->lowerpath && oip->lowerpath->dentry) + OVL_I(inode)->lower = igrab(d_inode(oip->lowerpath->dentry)); + if (oip->lowerdata) + OVL_I(inode)->lowerdata = igrab(d_inode(oip->lowerdata)); + + realinode = ovl_inode_real(inode); + ovl_copyattr(realinode, inode); + ovl_copyflags(realinode, inode); + ovl_map_ino(inode, ino, fsid); +} + +static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) +{ inode->i_mode = mode; inode->i_flags |= S_NOCMTIME; #ifdef CONFIG_FS_POSIX_ACL @@ -719,7 +766,7 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) inode = new_inode(sb); if (inode) - ovl_fill_inode(inode, mode, rdev, 0, 0); + ovl_fill_inode(inode, mode, rdev); return inode; } @@ -891,7 +938,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, oip->index); - int fsid = bylower ? oip->lowerpath->layer->fsid : 0; + int fsid = bylower ? lowerpath->layer->fsid : 0; bool is_dir, metacopy = false; unsigned long ino = 0; int err = oip->newinode ? -EEXIST : -ENOMEM; @@ -941,9 +988,11 @@ struct inode *ovl_get_inode(struct super_block *sb, err = -ENOMEM; goto out_err; } + ino = realinode->i_ino; + fsid = lowerpath->layer->fsid; } - ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid); - ovl_inode_init(inode, upperdentry, lowerdentry, oip->lowerdata); + ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); + ovl_inode_init(inode, oip, ino, fsid); if (upperdentry && ovl_is_impuredir(upperdentry)) ovl_set_flag(OVL_IMPURE, inode); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index ed9e129fae04..0db23baf98e7 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -845,7 +845,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (err) goto out; - if (upperdentry && unlikely(ovl_dentry_remote(upperdentry))) { + if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) { dput(upperdentry); err = -EREMOTE; goto out; @@ -1076,6 +1076,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_free_oe; } + ovl_dentry_update_reval(dentry, upperdentry, + DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); + revert_creds(old_cred); if (origin_path) { dput(origin_path->dentry); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 3d3f2b8bdae5..e6f3670146ed 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -48,6 +48,12 @@ enum ovl_entry_flag { OVL_E_CONNECTED, }; +enum { + OVL_XINO_OFF, + OVL_XINO_AUTO, + OVL_XINO_ON, +}; + /* * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, * where: @@ -87,7 +93,7 @@ struct ovl_fb { u8 flags; /* OVL_FH_FLAG_* */ u8 type; /* fid_type of fid */ uuid_t uuid; /* uuid of filesystem */ - u32 fid[0]; /* file identifier should be 32bit aligned in-memory */ + u32 fid[]; /* file identifier should be 32bit aligned in-memory */ } __packed; /* In-memory and on-wire format for overlay file handle */ @@ -230,6 +236,8 @@ bool ovl_index_all(struct super_block *sb); bool ovl_verify_lower(struct super_block *sb); struct ovl_entry *ovl_alloc_entry(unsigned int numlower); bool ovl_dentry_remote(struct dentry *dentry); +void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry, + unsigned int mask); bool ovl_dentry_weird(struct dentry *dentry); enum ovl_path_type ovl_path_type(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); @@ -264,8 +272,6 @@ void ovl_set_upperdata(struct inode *inode); bool ovl_redirect_dir(struct super_block *sb); const char *ovl_dentry_get_redirect(struct dentry *dentry); void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); -void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, - struct dentry *lowerdentry, struct dentry *lowerdata); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); void ovl_dir_modified(struct dentry *dentry, bool impurity); u64 ovl_dentry_version_get(struct dentry *dentry); @@ -301,6 +307,16 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE); } +/* + * With xino=auto, we do best effort to keep all inodes on same st_dev and + * d_ino consistent with st_ino. + * With xino=on, we do the same effort but we warn if we failed. + */ +static inline bool ovl_xino_warn(struct super_block *sb) +{ + return OVL_FS(sb)->config.xino == OVL_XINO_ON; +} + /* All layers on same fs? */ static inline bool ovl_same_fs(struct super_block *sb) { @@ -410,6 +426,8 @@ struct ovl_inode_params { char *redirect; struct dentry *lowerdata; }; +void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, + unsigned long ino, int fsid); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, bool is_upper); @@ -451,6 +469,7 @@ struct ovl_cattr { struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, struct ovl_cattr *attr); int ovl_cleanup(struct inode *dir, struct dentry *dentry); +struct dentry *ovl_lookup_temp(struct dentry *workdir); struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); /* file.c */ diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 89015ea822e7..5762d802fe01 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -75,6 +75,8 @@ struct ovl_fs { struct inode *indexdir_trap; /* -1: disabled, 0: same fs, 1..32: number of unused ino bits */ int xino_mode; + /* For allocation of non-persistent inode numbers */ + atomic_long_t last_ino; }; static inline struct ovl_fs *OVL_FS(struct super_block *sb) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 40ac9ce2465a..e452ff7d583d 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -438,15 +438,23 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) /* Map inode number to lower fs unique range */ static u64 ovl_remap_lower_ino(u64 ino, int xinobits, int fsid, - const char *name, int namelen) + const char *name, int namelen, bool warn) { - if (ino >> (64 - xinobits)) { - pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", - namelen, name, ino, xinobits); + unsigned int xinoshift = 64 - xinobits; + + if (unlikely(ino >> xinoshift)) { + if (warn) { + pr_warn_ratelimited("d_ino too big (%.*s, ino=%llu, xinobits=%d)\n", + namelen, name, ino, xinobits); + } return ino; } - return ino | ((u64)fsid) << (64 - xinobits); + /* + * The lowest xinobit is reserved for mapping the non-peresistent inode + * numbers range, but this range is only exposed via st_ino, not here. + */ + return ino | ((u64)fsid) << (xinoshift + 1); } /* @@ -515,7 +523,8 @@ get: } else if (xinobits && !OVL_TYPE_UPPER(type)) { ino = ovl_remap_lower_ino(ino, xinobits, ovl_layer_lower(this)->fsid, - p->name, p->len); + p->name, p->len, + ovl_xino_warn(dir->d_sb)); } out: @@ -645,6 +654,7 @@ struct ovl_readdir_translate { u64 parent_ino; int fsid; int xinobits; + bool xinowarn; }; static int ovl_fill_real(struct dir_context *ctx, const char *name, @@ -665,7 +675,7 @@ static int ovl_fill_real(struct dir_context *ctx, const char *name, ino = p->ino; } else if (rdt->xinobits) { ino = ovl_remap_lower_ino(ino, rdt->xinobits, rdt->fsid, - name, namelen); + name, namelen, rdt->xinowarn); } return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); @@ -696,6 +706,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) .ctx.actor = ovl_fill_real, .orig_ctx = ctx, .xinobits = ovl_xino_bits(dir->d_sb), + .xinowarn = ovl_xino_warn(dir->d_sb), }; if (rdt.xinobits && lower_layer) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ac967f1cb6e5..732ad5495c92 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -113,53 +113,54 @@ bug: return dentry; } -static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) { - struct ovl_entry *oe = dentry->d_fsdata; - unsigned int i; int ret = 1; - for (i = 0; i < oe->numlower; i++) { - struct dentry *d = oe->lowerstack[i].dentry; - - if (d->d_flags & DCACHE_OP_REVALIDATE) { - ret = d->d_op->d_revalidate(d, flags); - if (ret < 0) - return ret; - if (!ret) { - if (!(flags & LOOKUP_RCU)) - d_invalidate(d); - return -ESTALE; - } + if (weak) { + if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) + ret = d->d_op->d_weak_revalidate(d, flags); + } else if (d->d_flags & DCACHE_OP_REVALIDATE) { + ret = d->d_op->d_revalidate(d, flags); + if (!ret) { + if (!(flags & LOOKUP_RCU)) + d_invalidate(d); + ret = -ESTALE; } } - return 1; + return ret; } -static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) +static int ovl_dentry_revalidate_common(struct dentry *dentry, + unsigned int flags, bool weak) { struct ovl_entry *oe = dentry->d_fsdata; + struct dentry *upper; unsigned int i; int ret = 1; - for (i = 0; i < oe->numlower; i++) { - struct dentry *d = oe->lowerstack[i].dentry; + upper = ovl_dentry_upper(dentry); + if (upper) + ret = ovl_revalidate_real(upper, flags, weak); - if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) { - ret = d->d_op->d_weak_revalidate(d, flags); - if (ret <= 0) - break; - } + for (i = 0; ret > 0 && i < oe->numlower; i++) { + ret = ovl_revalidate_real(oe->lowerstack[i].dentry, flags, + weak); } return ret; } -static const struct dentry_operations ovl_dentry_operations = { - .d_release = ovl_dentry_release, - .d_real = ovl_d_real, -}; +static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +{ + return ovl_dentry_revalidate_common(dentry, flags, false); +} + +static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ + return ovl_dentry_revalidate_common(dentry, flags, true); +} -static const struct dentry_operations ovl_reval_dentry_operations = { +static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, .d_real = ovl_d_real, .d_revalidate = ovl_dentry_revalidate, @@ -316,12 +317,6 @@ static const char *ovl_redirect_mode_def(void) return ovl_redirect_dir_def ? "on" : "off"; } -enum { - OVL_XINO_OFF, - OVL_XINO_AUTO, - OVL_XINO_ON, -}; - static const char * const ovl_xino_str[] = { "off", "auto", @@ -751,13 +746,12 @@ static int ovl_mount_dir(const char *name, struct path *path) ovl_unescape(tmp); err = ovl_mount_dir_noesc(tmp, path); - if (!err) - if (ovl_dentry_remote(path->dentry)) { - pr_err("filesystem on '%s' not supported as upperdir\n", - tmp); - path_put_init(path); - err = -EINVAL; - } + if (!err && path->dentry->d_flags & DCACHE_OP_REAL) { + pr_err("filesystem on '%s' not supported as upperdir\n", + tmp); + path_put_init(path); + err = -EINVAL; + } kfree(tmp); } return err; @@ -778,7 +772,7 @@ static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, } static int ovl_lower_dir(const char *name, struct path *path, - struct ovl_fs *ofs, int *stack_depth, bool *remote) + struct ovl_fs *ofs, int *stack_depth) { int fh_type; int err; @@ -793,9 +787,6 @@ static int ovl_lower_dir(const char *name, struct path *path, *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); - if (ovl_dentry_remote(path->dentry)) - *remote = true; - /* * The inodes index feature and NFS export need to encode and decode * file handles, so they require that all layers support them. @@ -1074,11 +1065,73 @@ out: return err; } +/* + * Returns 1 if RENAME_WHITEOUT is supported, 0 if not supported and + * negative values if error is encountered. + */ +static int ovl_check_rename_whiteout(struct dentry *workdir) +{ + struct inode *dir = d_inode(workdir); + struct dentry *temp; + struct dentry *dest; + struct dentry *whiteout; + struct name_snapshot name; + int err; + + inode_lock_nested(dir, I_MUTEX_PARENT); + + temp = ovl_create_temp(workdir, OVL_CATTR(S_IFREG | 0)); + err = PTR_ERR(temp); + if (IS_ERR(temp)) + goto out_unlock; + + dest = ovl_lookup_temp(workdir); + err = PTR_ERR(dest); + if (IS_ERR(dest)) { + dput(temp); + goto out_unlock; + } + + /* Name is inline and stable - using snapshot as a copy helper */ + take_dentry_name_snapshot(&name, temp); + err = ovl_do_rename(dir, temp, dir, dest, RENAME_WHITEOUT); + if (err) { + if (err == -EINVAL) + err = 0; + goto cleanup_temp; + } + + whiteout = lookup_one_len(name.name.name, workdir, name.name.len); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + goto cleanup_temp; + + err = ovl_is_whiteout(whiteout); + + /* Best effort cleanup of whiteout and temp file */ + if (err) + ovl_cleanup(dir, whiteout); + dput(whiteout); + +cleanup_temp: + ovl_cleanup(dir, temp); + release_dentry_name_snapshot(&name); + dput(temp); + dput(dest); + +out_unlock: + inode_unlock(dir); + + return err; +} + static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, struct path *workpath) { struct vfsmount *mnt = ofs->upper_mnt; struct dentry *temp; + bool rename_whiteout; + bool d_type; int fh_type; int err; @@ -1104,11 +1157,8 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, if (err < 0) goto out; - /* - * We allowed this configuration and don't want to break users over - * kernel upgrade. So warn instead of erroring out. - */ - if (!err) + d_type = err; + if (!d_type) pr_warn("upper fs needs to support d_type.\n"); /* Check if upper/work fs supports O_TMPFILE */ @@ -1119,6 +1169,16 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, else pr_warn("upper fs does not support tmpfile.\n"); + + /* Check if upper/work fs supports RENAME_WHITEOUT */ + err = ovl_check_rename_whiteout(ofs->workdir); + if (err < 0) + goto out; + + rename_whiteout = err; + if (!rename_whiteout) + pr_warn("upper fs does not support RENAME_WHITEOUT.\n"); + /* * Check if upper/work fs supports trusted.overlay.* xattr */ @@ -1133,6 +1193,18 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs, vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); } + /* + * We allowed sub-optimal upper fs configuration and don't want to break + * users over kernel upgrade, but we never allowed remote upper fs, so + * we can enforce strict requirements for remote upper fs. + */ + if (ovl_dentry_remote(ofs->workdir) && + (!d_type || !rename_whiteout || ofs->noxattr)) { + pr_err("upper fs missing required features.\n"); + err = -EINVAL; + goto out; + } + /* Check if upper/work fs supports file handles */ fh_type = ovl_can_decode_fh(ofs->workdir->d_sb); if (ofs->config.index && !fh_type) { @@ -1401,11 +1473,12 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, /* * When all layers on same fs, overlay can use real inode numbers. - * With mount option "xino=on", mounter declares that there are enough - * free high bits in underlying fs to hold the unique fsid. + * With mount option "xino=<on|auto>", mounter declares that there are + * enough free high bits in underlying fs to hold the unique fsid. * If overlayfs does encounter underlying inodes using the high xino * bits reserved for fsid, it emits a warning and uses the original - * inode number. + * inode number or a non persistent inode number allocated from a + * dedicated range. */ if (ofs->numfs - !ofs->upper_mnt == 1) { if (ofs->config.xino == OVL_XINO_ON) @@ -1413,14 +1486,16 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, ofs->xino_mode = 0; } else if (ofs->config.xino == OVL_XINO_OFF) { ofs->xino_mode = -1; - } else if (ofs->config.xino == OVL_XINO_ON && ofs->xino_mode < 0) { + } else if (ofs->xino_mode < 0) { /* * This is a roundup of number of bits needed for encoding - * fsid, where fsid 0 is reserved for upper fs even with - * lower only overlay. + * fsid, where fsid 0 is reserved for upper fs (even with + * lower only overlay) +1 extra bit is reserved for the non + * persistent inode number range that is used for resolving + * xino lower bits overflow. */ - BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 31); - ofs->xino_mode = ilog2(ofs->numfs - 1) + 1; + BUILD_BUG_ON(ilog2(OVL_MAX_STACK) > 30); + ofs->xino_mode = ilog2(ofs->numfs - 1) + 2; } if (ofs->xino_mode > 0) { @@ -1440,7 +1515,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, char *lowertmp, *lower; struct path *stack = NULL; unsigned int stacklen, numlower = 0, i; - bool remote = false; struct ovl_entry *oe; err = -ENOMEM; @@ -1472,7 +1546,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, lower = lowertmp; for (numlower = 0; numlower < stacklen; numlower++) { err = ovl_lower_dir(lower, &stack[numlower], ofs, - &sb->s_stack_depth, &remote); + &sb->s_stack_depth); if (err) goto out_err; @@ -1500,11 +1574,6 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, oe->lowerstack[i].layer = &ofs->layers[i+1]; } - if (remote) - sb->s_d_op = &ovl_reval_dentry_operations; - else - sb->s_d_op = &ovl_dentry_operations; - out: for (i = 0; i < numlower; i++) path_put(&stack[i]); @@ -1589,6 +1658,44 @@ static int ovl_check_overlapping_layers(struct super_block *sb, return 0; } +static struct dentry *ovl_get_root(struct super_block *sb, + struct dentry *upperdentry, + struct ovl_entry *oe) +{ + struct dentry *root; + struct ovl_path *lowerpath = &oe->lowerstack[0]; + unsigned long ino = d_inode(lowerpath->dentry)->i_ino; + int fsid = lowerpath->layer->fsid; + struct ovl_inode_params oip = { + .upperdentry = upperdentry, + .lowerpath = lowerpath, + }; + + root = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); + if (!root) + return NULL; + + root->d_fsdata = oe; + + if (upperdentry) { + /* Root inode uses upper st_ino/i_ino */ + ino = d_inode(upperdentry)->i_ino; + fsid = 0; + ovl_dentry_set_upper_alias(root); + if (ovl_is_impuredir(upperdentry)) + ovl_set_flag(OVL_IMPURE, d_inode(root)); + } + + /* Root is always merge -> can have whiteouts */ + ovl_set_flag(OVL_WHITEOUTS, d_inode(root)); + ovl_dentry_set_flag(OVL_E_CONNECTED, root); + ovl_set_upperdata(d_inode(root)); + ovl_inode_init(d_inode(root), &oip, ino, fsid); + ovl_dentry_update_reval(root, upperdentry, DCACHE_OP_WEAK_REVALIDATE); + + return root; +} + static int ovl_fill_super(struct super_block *sb, void *data, int silent) { struct path upperpath = { }; @@ -1598,6 +1705,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) struct cred *cred; int err; + sb->s_d_op = &ovl_dentry_operations; + err = -ENOMEM; ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); if (!ofs) @@ -1624,6 +1733,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth = 0; sb->s_maxbytes = MAX_LFS_FILESIZE; + atomic_long_set(&ofs->last_ino, 1); /* Assume underlaying fs uses 32bit inodes unless proven otherwise */ if (ofs->config.xino != OVL_XINO_OFF) { ofs->xino_mode = BITS_PER_LONG - 32; @@ -1710,25 +1820,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= SB_POSIXACL; err = -ENOMEM; - root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); + root_dentry = ovl_get_root(sb, upperpath.dentry, oe); if (!root_dentry) goto out_free_oe; - root_dentry->d_fsdata = oe; - mntput(upperpath.mnt); - if (upperpath.dentry) { - ovl_dentry_set_upper_alias(root_dentry); - if (ovl_is_impuredir(upperpath.dentry)) - ovl_set_flag(OVL_IMPURE, d_inode(root_dentry)); - } - - /* Root is always merge -> can have whiteouts */ - ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); - ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry); - ovl_set_upperdata(d_inode(root_dentry)); - ovl_inode_init(d_inode(root_dentry), upperpath.dentry, - ovl_dentry_lower(root_dentry), NULL); sb->s_root = root_dentry; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 042f7eb4f7f4..36b60788ee47 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -93,8 +93,24 @@ struct ovl_entry *ovl_alloc_entry(unsigned int numlower) bool ovl_dentry_remote(struct dentry *dentry) { return dentry->d_flags & - (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | - DCACHE_OP_REAL); + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); +} + +void ovl_dentry_update_reval(struct dentry *dentry, struct dentry *upperdentry, + unsigned int mask) +{ + struct ovl_entry *oe = OVL_E(dentry); + unsigned int i, flags = 0; + + if (upperdentry) + flags |= upperdentry->d_flags; + for (i = 0; i < oe->numlower; i++) + flags |= oe->lowerstack[i].dentry->d_flags; + + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~mask; + dentry->d_flags |= flags & mask; + spin_unlock(&dentry->d_lock); } bool ovl_dentry_weird(struct dentry *dentry) @@ -386,24 +402,6 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect) oi->redirect = redirect; } -void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, - struct dentry *lowerdentry, struct dentry *lowerdata) -{ - struct inode *realinode = d_inode(upperdentry ?: lowerdentry); - - if (upperdentry) - OVL_I(inode)->__upperdentry = upperdentry; - if (lowerdentry) - OVL_I(inode)->lower = igrab(d_inode(lowerdentry)); - if (lowerdata) - OVL_I(inode)->lowerdata = igrab(d_inode(lowerdata)); - - ovl_copyattr(realinode, inode); - ovl_copyflags(realinode, inode); - if (!inode->i_ino) - inode->i_ino = realinode->i_ino; -} - void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) { struct inode *upperinode = d_inode(upperdentry); @@ -416,8 +414,6 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) smp_wmb(); OVL_I(inode)->__upperdentry = upperdentry; if (inode_unhashed(inode)) { - if (!inode->i_ino) - inode->i_ino = upperinode->i_ino; inode->i_private = upperinode; __insert_inode_hash(inode, (unsigned long) upperinode); } diff --git a/fs/pipe.c b/fs/pipe.c index 2144507447c5..16fb72e9abf7 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -146,7 +146,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct page *page = buf->page; if (page_count(page) == 1) { - memcg_kmem_uncharge(page, 0); + memcg_kmem_uncharge_page(page, 0); __SetPageLocked(page); return 0; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 5efaf3708ec6..8e16f14bb05a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -635,28 +635,35 @@ int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - unsigned long size = 0, resident = 0, shared = 0, text = 0, data = 0; struct mm_struct *mm = get_task_mm(task); if (mm) { + unsigned long size; + unsigned long resident = 0; + unsigned long shared = 0; + unsigned long text = 0; + unsigned long data = 0; + size = task_statm(mm, &shared, &text, &data, &resident); mmput(mm); - } - /* - * For quick read, open code by putting numbers directly - * expected format is - * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", - * size, resident, shared, text, data); - */ - seq_put_decimal_ull(m, "", size); - seq_put_decimal_ull(m, " ", resident); - seq_put_decimal_ull(m, " ", shared); - seq_put_decimal_ull(m, " ", text); - seq_put_decimal_ull(m, " ", 0); - seq_put_decimal_ull(m, " ", data); - seq_put_decimal_ull(m, " ", 0); - seq_putc(m, '\n'); + /* + * For quick read, open code by putting numbers directly + * expected format is + * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n", + * size, resident, shared, text, data); + */ + seq_put_decimal_ull(m, "", size); + seq_put_decimal_ull(m, " ", resident); + seq_put_decimal_ull(m, " ", shared); + seq_put_decimal_ull(m, " ", text); + seq_put_decimal_ull(m, " ", 0); + seq_put_decimal_ull(m, " ", data); + seq_put_decimal_ull(m, " ", 0); + seq_putc(m, '\n'); + } else { + seq_write(m, "0 0 0 0 0 0 0\n", 14); + } return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index c7c64272b0fa..6042b646ab27 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -405,11 +405,11 @@ print0: static int lock_trace(struct task_struct *task) { - int err = mutex_lock_killable(&task->signal->cred_guard_mutex); + int err = mutex_lock_killable(&task->signal->exec_update_mutex); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return -EPERM; } return 0; @@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task) static void unlock_trace(struct task_struct *task) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); } #ifdef CONFIG_STACKTRACE @@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode, *rgid = gid; } +void proc_pid_evict_inode(struct proc_inode *ei) +{ + struct pid *pid = ei->pid; + + if (S_ISDIR(ei->vfs_inode.i_mode)) { + spin_lock(&pid->lock); + hlist_del_init_rcu(&ei->sibling_inodes); + spin_unlock(&pid->lock); + } + + put_pid(pid); +} + struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; + struct pid *pid; /* We need a new inode */ @@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb, /* * grab the reference to task. */ - ei->pid = get_task_pid(task, PIDTYPE_PID); - if (!ei->pid) + pid = get_task_pid(task, PIDTYPE_PID); + if (!pid) goto out_unlock; + /* Let the pid remember us for quick removal */ + ei->pid = pid; + if (S_ISDIR(mode)) { + spin_lock(&pid->lock); + hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); + spin_unlock(&pid->lock); + } + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); @@ -2861,7 +2883,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unsigned long flags; int result; - result = mutex_lock_killable(&task->signal->cred_guard_mutex); + result = mutex_lock_killable(&task->signal->exec_update_mutex); if (result) return result; @@ -2897,7 +2919,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh result = 0; out_unlock: - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return result; } @@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .permission = proc_pid_permission, }; -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) -{ - struct dentry *dentry, *leader, *dir; - char buf[10 + 1]; - struct qstr name; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", pid); - /* no ->d_hash() rejects on procfs */ - dentry = d_hash_and_lookup(mnt->mnt_root, &name); - if (dentry) { - d_invalidate(dentry); - dput(dentry); - } - - if (pid == tgid) - return; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", tgid); - leader = d_hash_and_lookup(mnt->mnt_root, &name); - if (!leader) - goto out; - - name.name = "task"; - name.len = strlen(name.name); - dir = d_hash_and_lookup(leader, &name); - if (!dir) - goto out_put_leader; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", pid); - dentry = d_hash_and_lookup(dir, &name); - if (dentry) { - d_invalidate(dentry); - dput(dentry); - } - - dput(dir); -out_put_leader: - dput(leader); -out: - return; -} - /** - * proc_flush_task - Remove dcache entries for @task from the /proc dcache. - * @task: task that should be flushed. + * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache. + * @pid: pid that should be flushed. * - * When flushing dentries from proc, one needs to flush them from global - * proc (proc_mnt) and from all the namespaces' procs this task was seen - * in. This call is supposed to do all of this job. - * - * Looks in the dcache for - * /proc/@pid - * /proc/@tgid/task/@pid - * if either directory is present flushes it and all of it'ts children - * from the dcache. + * This function walks a list of inodes (that belong to any proc + * filesystem) that are attached to the pid and flushes them from + * the dentry cache. * * It is safe and reasonable to cache /proc entries for a task until * that task exits. After that they just clog up the dcache with * useless entries, possibly causing useful dcache entries to be - * flushed instead. This routine is proved to flush those useless - * dcache entries at process exit time. + * flushed instead. This routine is provided to flush those useless + * dcache entries when a process is reaped. * * NOTE: This routine is just an optimization so it does not guarantee - * that no dcache entries will exist at process exit time it - * just makes it very unlikely that any will persist. + * that no dcache entries will exist after a process is reaped + * it just makes it very unlikely that any will persist. */ -void proc_flush_task(struct task_struct *task) +void proc_flush_pid(struct pid *pid) { - int i; - struct pid *pid, *tgid; - struct upid *upid; - - pid = task_pid(task); - tgid = task_tgid(task); - - for (i = 0; i <= pid->level; i++) { - upid = &pid->numbers[i]; - proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, - tgid->numbers[i].nr); - } + proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); + put_pid(pid); } static struct dentry *proc_pid_instantiate(struct dentry * dentry, diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index c1dea9b8222e..d0989a443c77 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -17,6 +17,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file) } static const struct proc_ops cpuinfo_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = cpuinfo_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 3faed94e4b65..4ed6dabdf6ff 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -531,6 +531,12 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } +static inline void pde_set_flags(struct proc_dir_entry *pde) +{ + if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) + pde->flags |= PROC_ENTRY_PERMANENT; +} + struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops, void *data) @@ -541,6 +547,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, if (!p) return NULL; p->proc_ops = proc_ops; + pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); @@ -572,6 +579,7 @@ static int proc_seq_release(struct inode *inode, struct file *file) } static const struct proc_ops proc_seq_ops = { + /* not permanent -- can call into arbitrary seq_operations */ .proc_open = proc_seq_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -602,6 +610,7 @@ static int proc_single_open(struct inode *inode, struct file *file) } static const struct proc_ops proc_single_ops = { + /* not permanent -- can call into arbitrary ->single_show */ .proc_open = proc_single_open, .proc_read = seq_read, .proc_lseek = seq_lseek, @@ -662,9 +671,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) { - rb_erase(&de->subdir_node, &parent->subdir); - if (S_ISDIR(de->mode)) { - parent->nlink--; + if (unlikely(pde_is_permanent(de))) { + WARN(1, "removing permanent /proc entry '%s'", de->name); + de = NULL; + } else { + rb_erase(&de->subdir_node, &parent->subdir); + if (S_ISDIR(de->mode)) + parent->nlink--; } } write_unlock(&proc_subdir_lock); @@ -700,12 +713,24 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } + if (unlikely(pde_is_permanent(root))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + root->parent->name, root->name); + return -EINVAL; + } rb_erase(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { + if (unlikely(pde_is_permanent(root))) { + write_unlock(&proc_subdir_lock); + WARN(1, "removing permanent /proc entry '%s/%s'", + next->parent->name, next->name); + return -EINVAL; + } rb_erase(&next->subdir_node, &de->subdir); de = next; continue; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6da18316d209..fb4cace9ea41 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -33,21 +33,27 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; + struct proc_inode *ei = PROC_I(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ - put_pid(PROC_I(inode)->pid); + if (ei->pid) { + proc_pid_evict_inode(ei); + ei->pid = NULL; + } /* Let go of any associated proc directory entry */ - de = PDE(inode); - if (de) + de = ei->pde; + if (de) { pde_put(de); + ei->pde = NULL; + } - head = PROC_I(inode)->sysctl; + head = ei->sysctl; if (head) { - RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); + RCU_INIT_POINTER(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } @@ -68,6 +74,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; + INIT_HLIST_NODE(&ei->sibling_inodes); ei->ns_ops = NULL; return &ei->vfs_inode; } @@ -102,6 +109,62 @@ void __init proc_init_kmemcache(void) BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } +void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) +{ + struct inode *inode; + struct proc_inode *ei; + struct hlist_node *node; + struct super_block *old_sb = NULL; + + rcu_read_lock(); + for (;;) { + struct super_block *sb; + node = hlist_first_rcu(inodes); + if (!node) + break; + ei = hlist_entry(node, struct proc_inode, sibling_inodes); + spin_lock(lock); + hlist_del_init_rcu(&ei->sibling_inodes); + spin_unlock(lock); + + inode = &ei->vfs_inode; + sb = inode->i_sb; + if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active)) + continue; + inode = igrab(inode); + rcu_read_unlock(); + if (sb != old_sb) { + if (old_sb) + deactivate_super(old_sb); + old_sb = sb; + } + if (unlikely(!inode)) { + rcu_read_lock(); + continue; + } + + if (S_ISDIR(inode->i_mode)) { + struct dentry *dir = d_find_any_alias(inode); + if (dir) { + d_invalidate(dir); + dput(dir); + } + } else { + struct dentry *dentry; + while ((dentry = d_find_alias(inode))) { + d_invalidate(dentry); + dput(dentry); + } + } + iput(inode); + + rcu_read_lock(); + } + rcu_read_unlock(); + if (old_sb) + deactivate_super(old_sb); +} + static int proc_show_options(struct seq_file *seq, struct dentry *root) { struct super_block *sb = root->d_sb; @@ -139,6 +202,7 @@ static void unuse_pde(struct proc_dir_entry *pde) /* pde is locked on entry, unlocked on exit */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) + __releases(&pde->pde_unload_lock) { /* * close() (proc_reg_release()) can't delete an entry and proceed: @@ -195,135 +259,204 @@ void proc_entry_rundown(struct proc_dir_entry *de) spin_unlock(&de->pde_unload_lock); } +static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence) +{ + typeof_member(struct proc_ops, proc_lseek) lseek; + + lseek = pde->proc_ops->proc_lseek; + if (!lseek) + lseek = default_llseek; + return lseek(file, offset, whence); +} + static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) { struct proc_dir_entry *pde = PDE(file_inode(file)); loff_t rv = -EINVAL; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_lseek) lseek; - lseek = pde->proc_ops->proc_lseek; - if (!lseek) - lseek = default_llseek; - rv = lseek(file, offset, whence); + if (pde_is_permanent(pde)) { + return pde_lseek(pde, file, offset, whence); + } else if (use_pde(pde)) { + rv = pde_lseek(pde, file, offset, whence); unuse_pde(pde); } return rv; } +static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_read) read; + + read = pde->proc_ops->proc_read; + if (read) + return read(file, buf, count, ppos); + return -EIO; +} + static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_read) read; - read = pde->proc_ops->proc_read; - if (read) - rv = read(file, buf, count, ppos); + if (pde_is_permanent(pde)) { + return pde_read(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_read(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } +static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + typeof_member(struct proc_ops, proc_write) write; + + write = pde->proc_ops->proc_write; + if (write) + return write(file, buf, count, ppos); + return -EIO; +} + static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct proc_dir_entry *pde = PDE(file_inode(file)); ssize_t rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_write) write; - write = pde->proc_ops->proc_write; - if (write) - rv = write(file, buf, count, ppos); + if (pde_is_permanent(pde)) { + return pde_write(pde, file, buf, count, ppos); + } else if (use_pde(pde)) { + rv = pde_write(pde, file, buf, count, ppos); unuse_pde(pde); } return rv; } +static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) +{ + typeof_member(struct proc_ops, proc_poll) poll; + + poll = pde->proc_ops->proc_poll; + if (poll) + return poll(file, pts); + return DEFAULT_POLLMASK; +} + static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file_inode(file)); __poll_t rv = DEFAULT_POLLMASK; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_poll) poll; - poll = pde->proc_ops->proc_poll; - if (poll) - rv = poll(file, pts); + if (pde_is_permanent(pde)) { + return pde_poll(pde, file, pts); + } else if (use_pde(pde)) { + rv = pde_poll(pde, file, pts); unuse_pde(pde); } return rv; } +static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, proc_ioctl) ioctl; + + ioctl = pde->proc_ops->proc_ioctl; + if (ioctl) + return ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_ioctl) ioctl; - ioctl = pde->proc_ops->proc_ioctl; - if (ioctl) - rv = ioctl(file, cmd, arg); + if (pde_is_permanent(pde)) { + return pde_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #ifdef CONFIG_COMPAT +static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) +{ + typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; + + compat_ioctl = pde->proc_ops->proc_compat_ioctl; + if (compat_ioctl) + return compat_ioctl(file, cmd, arg); + return -ENOTTY; +} + static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct proc_dir_entry *pde = PDE(file_inode(file)); long rv = -ENOTTY; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; - - compat_ioctl = pde->proc_ops->proc_compat_ioctl; - if (compat_ioctl) - rv = compat_ioctl(file, cmd, arg); + if (pde_is_permanent(pde)) { + return pde_compat_ioctl(pde, file, cmd, arg); + } else if (use_pde(pde)) { + rv = pde_compat_ioctl(pde, file, cmd, arg); unuse_pde(pde); } return rv; } #endif +static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) +{ + typeof_member(struct proc_ops, proc_mmap) mmap; + + mmap = pde->proc_ops->proc_mmap; + if (mmap) + return mmap(file, vma); + return -EIO; +} + static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) { struct proc_dir_entry *pde = PDE(file_inode(file)); int rv = -EIO; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_mmap) mmap; - mmap = pde->proc_ops->proc_mmap; - if (mmap) - rv = mmap(file, vma); + if (pde_is_permanent(pde)) { + return pde_mmap(pde, file, vma); + } else if (use_pde(pde)) { + rv = pde_mmap(pde, file, vma); unuse_pde(pde); } return rv; } static unsigned long -proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, +pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - struct proc_dir_entry *pde = PDE(file_inode(file)); - unsigned long rv = -EIO; + typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; - if (use_pde(pde)) { - typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; - - get_area = pde->proc_ops->proc_get_unmapped_area; + get_area = pde->proc_ops->proc_get_unmapped_area; #ifdef CONFIG_MMU - if (!get_area) - get_area = current->mm->get_unmapped_area; + if (!get_area) + get_area = current->mm->get_unmapped_area; #endif + if (get_area) + return get_area(file, orig_addr, len, pgoff, flags); + return orig_addr; +} - if (get_area) - rv = get_area(file, orig_addr, len, pgoff, flags); - else - rv = orig_addr; +static unsigned long +proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct proc_dir_entry *pde = PDE(file_inode(file)); + unsigned long rv = -EIO; + + if (pde_is_permanent(pde)) { + return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); + } else if (use_pde(pde)) { + rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); unuse_pde(pde); } return rv; @@ -337,6 +470,13 @@ static int proc_reg_open(struct inode *inode, struct file *file) typeof_member(struct proc_ops, proc_release) release; struct pde_opener *pdeo; + if (pde_is_permanent(pde)) { + open = pde->proc_ops->proc_open; + if (open) + rv = open(inode, file); + return rv; + } + /* * Ensure that * 1) PDE's ->release hook will be called no matter what @@ -386,6 +526,17 @@ static int proc_reg_release(struct inode *inode, struct file *file) { struct proc_dir_entry *pde = PDE(inode); struct pde_opener *pdeo; + + if (pde_is_permanent(pde)) { + typeof_member(struct proc_ops, proc_release) release; + + release = pde->proc_ops->proc_release; + if (release) { + return release(inode, file); + } + return 0; + } + spin_lock(&pde->pde_unload_lock); list_for_each_entry(pdeo, &pde->pde_openers, lh) { if (pdeo->file == file) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 41587276798e..917cc85e3466 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -61,6 +61,7 @@ struct proc_dir_entry { struct rb_node subdir_node; char *name; umode_t mode; + u8 flags; u8 namelen; char inline_name[]; } __randomize_layout; @@ -73,6 +74,11 @@ struct proc_dir_entry { 0) #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) +static inline bool pde_is_permanent(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_PERMANENT; +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ -91,7 +97,7 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; - struct hlist_node sysctl_inodes; + struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; } __randomize_layout; @@ -158,6 +164,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct dentry *, struct iattr *); +extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); @@ -210,6 +217,7 @@ extern const struct inode_operations proc_pid_link_inode_operations; extern const struct super_operations proc_sops; void proc_init_kmemcache(void); +void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index ec1b7d2fb773..b38ad552887f 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -50,6 +50,7 @@ static __poll_t kmsg_poll(struct file *file, poll_table *wait) static const struct proc_ops kmsg_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_read = kmsg_read, .proc_poll = kmsg_poll, .proc_open = kmsg_open, diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c75bb4632ed1..b6f5d459b087 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -267,42 +267,9 @@ static void unuse_table(struct ctl_table_header *p) complete(p->unregistering); } -static void proc_sys_prune_dcache(struct ctl_table_header *head) +static void proc_sys_invalidate_dcache(struct ctl_table_header *head) { - struct inode *inode; - struct proc_inode *ei; - struct hlist_node *node; - struct super_block *sb; - - rcu_read_lock(); - for (;;) { - node = hlist_first_rcu(&head->inodes); - if (!node) - break; - ei = hlist_entry(node, struct proc_inode, sysctl_inodes); - spin_lock(&sysctl_lock); - hlist_del_init_rcu(&ei->sysctl_inodes); - spin_unlock(&sysctl_lock); - - inode = &ei->vfs_inode; - sb = inode->i_sb; - if (!atomic_inc_not_zero(&sb->s_active)) - continue; - inode = igrab(inode); - rcu_read_unlock(); - if (unlikely(!inode)) { - deactivate_super(sb); - rcu_read_lock(); - continue; - } - - d_prune_aliases(inode); - iput(inode); - deactivate_super(sb); - - rcu_read_lock(); - } - rcu_read_unlock(); + proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } /* called under sysctl_lock, will reacquire if has to wait */ @@ -324,10 +291,10 @@ static void start_unregistering(struct ctl_table_header *p) spin_unlock(&sysctl_lock); } /* - * Prune dentries for unregistered sysctls: namespaced sysctls + * Invalidate dentries for unregistered sysctls: namespaced sysctls * can have duplicate names and contaminate dcache very badly. */ - proc_sys_prune_dcache(p); + proc_sys_invalidate_dcache(p); /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. @@ -483,7 +450,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, } ei->sysctl = head; ei->sysctl_entry = table; - hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes); + hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); head->count++; spin_unlock(&sysctl_lock); @@ -514,7 +481,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { spin_lock(&sysctl_lock); - hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes); + hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); if (!--head->count) kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); diff --git a/fs/proc/root.c b/fs/proc/root.c index 608233dfd29c..2633f10446c3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -292,39 +292,3 @@ struct proc_dir_entry proc_root = { .subdir = RB_ROOT, .name = "/proc", }; - -int pid_ns_prepare_proc(struct pid_namespace *ns) -{ - struct proc_fs_context *ctx; - struct fs_context *fc; - struct vfsmount *mnt; - - fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT); - if (IS_ERR(fc)) - return PTR_ERR(fc); - - if (fc->user_ns != ns->user_ns) { - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(ns->user_ns); - } - - ctx = fc->fs_private; - if (ctx->pid_ns != ns) { - put_pid_ns(ctx->pid_ns); - get_pid_ns(ns); - ctx->pid_ns = ns; - } - - mnt = fc_mount(fc); - put_fs_context(fc); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - ns->proc_mnt = mnt; - return 0; -} - -void pid_ns_release_proc(struct pid_namespace *ns) -{ - kern_unmount(ns->proc_mnt); -} diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 0449edf460f5..46b3293015fe 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -224,6 +224,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct proc_ops stat_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3ba9ae83bff5..8d382d4ec067 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -123,38 +123,14 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif -static void vma_stop(struct proc_maps_private *priv) -{ - struct mm_struct *mm = priv->mm; - - release_task_mempolicy(priv); - up_read(&mm->mmap_sem); - mmput(mm); -} - -static struct vm_area_struct * -m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma) -{ - if (vma == priv->tail_vma) - return NULL; - return vma->vm_next ?: priv->tail_vma; -} - -static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma) -{ - if (m->count < m->size) /* vma is copied successfully */ - m->version = m_next_vma(m->private, vma) ? vma->vm_end : -1UL; -} - static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; - unsigned long last_addr = m->version; + unsigned long last_addr = *ppos; struct mm_struct *mm; struct vm_area_struct *vma; - unsigned int pos = *ppos; - /* See m_cache_vma(). Zero at the start or after lseek. */ + /* See m_next(). Zero at the start or after lseek. */ if (last_addr == -1UL) return NULL; @@ -163,64 +139,59 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } if (down_read_killable(&mm->mmap_sem)) { mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; return ERR_PTR(-EINTR); } hold_task_mempolicy(priv); priv->tail_vma = get_gate_vma(mm); - if (last_addr) { - vma = find_vma(mm, last_addr - 1); - if (vma && vma->vm_start <= last_addr) - vma = m_next_vma(priv, vma); - if (vma) - return vma; - } - - m->version = 0; - if (pos < mm->map_count) { - for (vma = mm->mmap; pos; pos--) { - m->version = vma->vm_start; - vma = vma->vm_next; - } + vma = find_vma(mm, last_addr); + if (vma) return vma; - } - - /* we do not bother to update m->version in this case */ - if (pos == mm->map_count && priv->tail_vma) - return priv->tail_vma; - vma_stop(priv); - return NULL; + return priv->tail_vma; } -static void *m_next(struct seq_file *m, void *v, loff_t *pos) +static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { struct proc_maps_private *priv = m->private; - struct vm_area_struct *next; + struct vm_area_struct *next, *vma = v; + + if (vma == priv->tail_vma) + next = NULL; + else if (vma->vm_next) + next = vma->vm_next; + else + next = priv->tail_vma; + + *ppos = next ? next->vm_start : -1UL; - (*pos)++; - next = m_next_vma(priv, v); - if (!next) - vma_stop(priv); return next; } static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; - if (!IS_ERR_OR_NULL(v)) - vma_stop(priv); - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + release_task_mempolicy(priv); + up_read(&mm->mmap_sem); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } static int proc_maps_open(struct inode *inode, struct file *file, @@ -363,7 +334,6 @@ done: static int show_map(struct seq_file *m, void *v) { show_map_vma(m, v); - m_cache_vma(m, v); return 0; } @@ -847,8 +817,6 @@ static int show_smap(struct seq_file *m, void *v) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); - m_cache_vma(m, vma); - return 0; } @@ -1887,7 +1855,6 @@ static int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); out: seq_putc(m, '\n'); - m_cache_vma(m, vma); return 0; } diff --git a/fs/read_write.c b/fs/read_write.c index 59d819c5b92e..bbfa9b12b15e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -331,7 +331,8 @@ COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned i } #endif -#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) +#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ + defined(__ARCH_WANT_SYS_LLSEEK) SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, unsigned long, offset_low, loff_t __user *, result, unsigned int, whence) diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 4075e41408b4..5129efc6f2e6 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -842,7 +842,7 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb, struct item_head *pasted; struct buffer_info bi; - buffer_info_init_right(tb, &bi); + buffer_info_init_right(tb, &bi); leaf_shift_right(tb, tb->rnum[0], tb->rbytes); /* append item in R[0] */ diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 45e1a5d11af3..adb21bea3d60 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -184,11 +184,12 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) } /* we need to make sure nobody is changing the file size beneath us */ -{ - int depth = reiserfs_write_unlock_nested(inode->i_sb); - inode_lock(inode); - reiserfs_write_lock_nested(inode->i_sb, depth); -} + { + int depth = reiserfs_write_unlock_nested(inode->i_sb); + + inode_lock(inode); + reiserfs_write_lock_nested(inode->i_sb, depth); + } reiserfs_write_lock(inode->i_sb); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 959a066b7bb0..1594687582f0 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -838,10 +838,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode */ INC_DIR_INODE_NLINK(dir) - retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ , - old_format_only(dir->i_sb) ? - EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, - dentry, inode, &security); + retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */, + old_format_only(dir->i_sb) ? + EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, + dentry, inode, &security); if (retval) { DEC_DIR_INODE_NLINK(dir) goto out_failed; @@ -967,7 +967,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) reiserfs_update_sd(&th, inode); DEC_DIR_INODE_NLINK(dir) - dir->i_size -= (DEH_SIZE + de.de_entrylen); + dir->i_size -= (DEH_SIZE + de.de_entrylen); reiserfs_update_sd(&th, dir); /* prevent empty directory from getting lost */ diff --git a/fs/seq_file.c b/fs/seq_file.c index 1600034a929b..70f5fdf99bf6 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -68,13 +68,6 @@ int seq_open(struct file *file, const struct seq_operations *op) p->file = file; /* - * Wrappers around seq_open(e.g. swaps_open) need to be - * aware of this. If they set f_version themselves, they - * should call seq_open first and then set f_version. - */ - file->f_version = 0; - - /* * seq_files support lseek() and pread(). They do not implement * write() at all, but we clear FMODE_PWRITE here for historical * reasons. @@ -94,7 +87,6 @@ static int traverse(struct seq_file *m, loff_t offset) int error = 0; void *p; - m->version = 0; m->index = 0; m->count = m->from = 0; if (!offset) @@ -161,25 +153,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) mutex_lock(&m->lock); /* - * seq_file->op->..m_start/m_stop/m_next may do special actions - * or optimisations based on the file->f_version, so we want to - * pass the file->f_version to those methods. - * - * seq_file->version is just copy of f_version, and seq_file - * methods can treat it simply as file version. - * It is copied in first and copied out after all operations. - * It is convenient to have it as part of structure to avoid the - * need of passing another argument to all the seq_file methods. - */ - m->version = file->f_version; - - /* * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ if (*ppos == 0) { m->index = 0; - m->version = 0; m->count = 0; } @@ -190,7 +168,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (err) { /* With prejudice... */ m->read_pos = 0; - m->version = 0; m->index = 0; m->count = 0; goto Done; @@ -243,7 +220,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; - m->version = 0; p = m->op->start(m, &m->index); } m->op->stop(m, p); @@ -256,9 +232,12 @@ Fill: loff_t pos = m->index; p = m->op->next(m, p, &m->index); - if (pos == m->index) - /* Buggy ->next function */ + if (pos == m->index) { + pr_info_ratelimited("buggy seq_file .next function %ps " + "did not updated position index\n", + m->op->next); m->index++; + } if (!p || IS_ERR(p)) { err = PTR_ERR(p); break; @@ -287,7 +266,6 @@ Done: *ppos += copied; m->read_pos += copied; } - file->f_version = m->version; mutex_unlock(&m->lock); return copied; Enomem: @@ -313,7 +291,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) loff_t retval = -EINVAL; mutex_lock(&m->lock); - m->version = file->f_version; switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -329,7 +306,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) /* with extreme prejudice... */ file->f_pos = 0; m->read_pos = 0; - m->version = 0; m->index = 0; m->count = 0; } else { @@ -340,7 +316,6 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence) file->f_pos = offset; } } - file->f_version = m->version; mutex_unlock(&m->lock); return retval; } diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 130fc6fbcc03..26bbf960e2a2 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -558,3 +558,151 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); + +static int internal_change_owner(struct kernfs_node *kn, kuid_t kuid, + kgid_t kgid) +{ + struct iattr newattrs = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = kuid, + .ia_gid = kgid, + }; + return kernfs_setattr(kn, &newattrs); +} + +/** + * sysfs_link_change_owner - change owner of a sysfs file. + * @kobj: object of the kernfs_node the symlink is located in. + * @targ: object of the kernfs_node the symlink points to. + * @name: name of the link. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * This function looks up the sysfs symlink entry @name under @kobj and changes + * the ownership to @kuid/@kgid. The symlink is looked up in the namespace of + * @targ. + * + * Returns 0 on success or error code on failure. + */ +int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, + const char *name, kuid_t kuid, kgid_t kgid) +{ + struct kernfs_node *kn = NULL; + int error; + + if (!name || !kobj->state_in_sysfs || !targ->state_in_sysfs) + return -EINVAL; + + error = -ENOENT; + kn = kernfs_find_and_get_ns(kobj->sd, name, targ->sd->ns); + if (!kn) + goto out; + + error = -EINVAL; + if (kernfs_type(kn) != KERNFS_LINK) + goto out; + if (kn->symlink.target_kn->priv != targ) + goto out; + + error = internal_change_owner(kn, kuid, kgid); + +out: + kernfs_put(kn); + return error; +} + +/** + * sysfs_file_change_owner - change owner of a sysfs file. + * @kobj: object. + * @name: name of the file to change. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * This function looks up the sysfs entry @name under @kobj and changes the + * ownership to @kuid/@kgid. + * + * Returns 0 on success or error code on failure. + */ +int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, + kgid_t kgid) +{ + struct kernfs_node *kn; + int error; + + if (!name) + return -EINVAL; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + kn = kernfs_find_and_get(kobj->sd, name); + if (!kn) + return -ENOENT; + + error = internal_change_owner(kn, kuid, kgid); + + kernfs_put(kn); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_file_change_owner); + +/** + * sysfs_change_owner - change owner of the given object. + * @kobj: object. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * Change the owner of the default directory, files, groups, and attributes of + * @kobj to @kuid/@kgid. Note that sysfs_change_owner mirrors how the sysfs + * entries for a kobject are added by driver core. In summary, + * sysfs_change_owner() takes care of the default directory entry for @kobj, + * the default attributes associated with the ktype of @kobj and the default + * attributes associated with the ktype of @kobj. + * Additional properties not added by driver core have to be changed by the + * driver or subsystem which created them. This is similar to how + * driver/subsystem specific entries are removed. + * + * Returns 0 on success or error code on failure. + */ +int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) +{ + int error; + const struct kobj_type *ktype; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + /* Change the owner of the kobject itself. */ + error = internal_change_owner(kobj->sd, kuid, kgid); + if (error) + return error; + + ktype = get_ktype(kobj); + if (ktype) { + struct attribute **kattr; + + /* + * Change owner of the default attributes associated with the + * ktype of @kobj. + */ + for (kattr = ktype->default_attrs; kattr && *kattr; kattr++) { + error = sysfs_file_change_owner(kobj, (*kattr)->name, + kuid, kgid); + if (error) + return error; + } + + /* + * Change owner of the default groups associated with the + * ktype of @kobj. + */ + error = sysfs_groups_change_owner(kobj, ktype->default_groups, + kuid, kgid); + if (error) + return error; + } + + return 0; +} +EXPORT_SYMBOL_GPL(sysfs_change_owner); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index c4ab045926b7..64e6a6698935 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -13,6 +13,7 @@ #include <linux/dcache.h> #include <linux/namei.h> #include <linux/err.h> +#include <linux/fs.h> #include "sysfs.h" @@ -415,15 +416,18 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); /** - * __compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing + * compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing * to a group or an attribute * @kobj: The kobject containing the group. * @target_kobj: The target kobject. * @target_name: The name of the target group or attribute. + * @symlink_name: The name of the symlink file (target_name will be + * considered if symlink_name is NULL). */ -int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, - struct kobject *target_kobj, - const char *target_name) +int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, + struct kobject *target_kobj, + const char *target_name, + const char *symlink_name) { struct kernfs_node *target; struct kernfs_node *entry; @@ -448,12 +452,129 @@ int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, return -ENOENT; } - link = kernfs_create_link(kobj->sd, target_name, entry); + if (!symlink_name) + symlink_name = target_name; + + link = kernfs_create_link(kobj->sd, symlink_name, entry); if (PTR_ERR(link) == -EEXIST) - sysfs_warn_dup(kobj->sd, target_name); + sysfs_warn_dup(kobj->sd, symlink_name); kernfs_put(entry); kernfs_put(target); return PTR_ERR_OR_ZERO(link); } -EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj); +EXPORT_SYMBOL_GPL(compat_only_sysfs_link_entry_to_kobj); + +static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn, + const struct attribute_group *grp, + struct iattr *newattrs) +{ + struct kernfs_node *kn; + int error; + + if (grp->attrs) { + struct attribute *const *attr; + + for (attr = grp->attrs; *attr; attr++) { + kn = kernfs_find_and_get(grp_kn, (*attr)->name); + if (!kn) + return -ENOENT; + + error = kernfs_setattr(kn, newattrs); + kernfs_put(kn); + if (error) + return error; + } + } + + if (grp->bin_attrs) { + struct bin_attribute *const *bin_attr; + + for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { + kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name); + if (!kn) + return -ENOENT; + + error = kernfs_setattr(kn, newattrs); + kernfs_put(kn); + if (error) + return error; + } + } + + return 0; +} + +/** + * sysfs_group_change_owner - change owner of an attribute group. + * @kobj: The kobject containing the group. + * @grp: The attribute group. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * Returns 0 on success or error code on failure. + */ +int sysfs_group_change_owner(struct kobject *kobj, + const struct attribute_group *grp, kuid_t kuid, + kgid_t kgid) +{ + struct kernfs_node *grp_kn; + int error; + struct iattr newattrs = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = kuid, + .ia_gid = kgid, + }; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + if (grp->name) { + grp_kn = kernfs_find_and_get(kobj->sd, grp->name); + } else { + kernfs_get(kobj->sd); + grp_kn = kobj->sd; + } + if (!grp_kn) + return -ENOENT; + + error = kernfs_setattr(grp_kn, &newattrs); + if (!error) + error = sysfs_group_attrs_change_owner(grp_kn, grp, &newattrs); + + kernfs_put(grp_kn); + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_group_change_owner); + +/** + * sysfs_groups_change_owner - change owner of a set of attribute groups. + * @kobj: The kobject containing the groups. + * @groups: The attribute groups. + * @kuid: new owner's kuid + * @kgid: new owner's kgid + * + * Returns 0 on success or error code on failure. + */ +int sysfs_groups_change_owner(struct kobject *kobj, + const struct attribute_group **groups, + kuid_t kuid, kgid_t kgid) +{ + int error = 0, i; + + if (!kobj->state_in_sysfs) + return -EINVAL; + + if (!groups) + return 0; + + for (i = 0; groups[i]; i++) { + error = sysfs_group_change_owner(kobj, groups[i], kuid, kgid); + if (error) + break; + } + + return error; +} +EXPORT_SYMBOL_GPL(sysfs_groups_change_owner); diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 8ceb51478800..7e4bfaf2871f 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -225,7 +225,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, int offs, int quiet, int must_chk_crc) { - int err = -EINVAL, type, node_len; + int err = -EINVAL, type, node_len, dump_node = 1; uint32_t crc, node_crc, magic; const struct ubifs_ch *ch = buf; @@ -278,10 +278,22 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, out_len: if (!quiet) ubifs_err(c, "bad node length %d", node_len); + if (type == UBIFS_DATA_NODE && node_len > UBIFS_DATA_NODE_SZ) + dump_node = 0; out: if (!quiet) { ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); - ubifs_dump_node(c, buf); + if (dump_node) { + ubifs_dump_node(c, buf); + } else { + int safe_len = min3(node_len, c->leb_size - offs, + (int)UBIFS_MAX_DATA_NODE_SZ); + pr_err("\tprevent out-of-bounds memory access\n"); + pr_err("\ttruncated data node length %d\n", safe_len); + pr_err("\tcorrupted data node:\n"); + print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1, + buf, safe_len, 0); + } dump_stack(); } return err; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 3bf8b1fda9d7..e5ec1afe1c66 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -905,6 +905,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) ubifs_err(c, "dead directory entry '%s', error %d", xent->name, err); ubifs_ro_mode(c, err); + kfree(xent); goto out_release; } ubifs_assert(c, ubifs_inode(xino)->xattr); diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index edf43ddd7dce..283f9eb48410 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -157,7 +157,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) int err = 0; ino_t xattr_inum; union ubifs_key key; - struct ubifs_dent_node *xent; + struct ubifs_dent_node *xent, *pxent = NULL; struct fscrypt_name nm = {0}; struct ubifs_orphan *xattr_orphan; struct ubifs_orphan *orphan; @@ -181,11 +181,16 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) xattr_inum = le64_to_cpu(xent->inum); xattr_orphan = orphan_add(c, xattr_inum, orphan); - if (IS_ERR(xattr_orphan)) + if (IS_ERR(xattr_orphan)) { + kfree(xent); return PTR_ERR(xattr_orphan); + } + kfree(pxent); + pxent = xent; key_read(c, &xent->key, &key); } + kfree(pxent); return 0; } @@ -688,14 +693,14 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_key_init(c, &key1, inum); err = ubifs_tnc_lookup(c, &key1, ino); - if (err) + if (err && err != -ENOENT) goto out_free; /* * Check whether an inode can really get deleted. * linkat() with O_TMPFILE allows rebirth of an inode. */ - if (ino->nlink == 0) { + if (err == 0 && ino->nlink == 0) { dbg_rcvry("deleting orphaned inode %lu", (unsigned long)inum); diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index 3fd85464abd5..736ebc5dc441 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -5,7 +5,7 @@ * http://www.ecma.ch * * Copyright (c) 2001-2002 Ben Fennema - * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com> + * Copyright (c) 2017-2019 Pali Rohár <pali@kernel.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h index 35e61b2cacfe..d5fbfab3ddb6 100644 --- a/fs/udf/osta_udf.h +++ b/fs/udf/osta_udf.h @@ -5,7 +5,7 @@ * http://www.osta.org * * Copyright (c) 2001-2004 Ben Fennema - * Copyright (c) 2017-2019 Pali Rohár <pali.rohar@gmail.com> + * Copyright (c) 2017-2019 Pali Rohár <pali@kernel.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 3d83be54c474..758efe557a19 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -83,7 +83,7 @@ struct udf_virtual_data { struct udf_bitmap { __u32 s_extPosition; int s_nr_groups; - struct buffer_head *s_block_bitmap[0]; + struct buffer_head *s_block_bitmap[]; }; struct udf_part_map { diff --git a/fs/unicode/.gitignore b/fs/unicode/.gitignore index 0381e2221480..9b2467e77b2d 100644 --- a/fs/unicode/.gitignore +++ b/fs/unicode/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only mkutf8data utf8data.h diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 37df7c9eedb1..e39fdec8a0b0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -314,8 +314,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, if (!pmd_present(_pmd)) goto out; - if (pmd_trans_huge(_pmd)) + if (pmd_trans_huge(_pmd)) { + if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) + ret = true; goto out; + } /* * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it @@ -328,12 +331,38 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, */ if (pte_none(*pte)) ret = true; + if (!pte_write(*pte) && (reason & VM_UFFD_WP)) + ret = true; pte_unmap(pte); out: return ret; } +/* Should pair with userfaultfd_signal_pending() */ +static inline long userfaultfd_get_blocking_state(unsigned int flags) +{ + if (flags & FAULT_FLAG_INTERRUPTIBLE) + return TASK_INTERRUPTIBLE; + + if (flags & FAULT_FLAG_KILLABLE) + return TASK_KILLABLE; + + return TASK_UNINTERRUPTIBLE; +} + +/* Should pair with userfaultfd_get_blocking_state() */ +static inline bool userfaultfd_signal_pending(unsigned int flags) +{ + if (flags & FAULT_FLAG_INTERRUPTIBLE) + return signal_pending(current); + + if (flags & FAULT_FLAG_KILLABLE) + return fatal_signal_pending(current); + + return false; +} + /* * The locking rules involved in returning VM_FAULT_RETRY depending on * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and @@ -355,7 +384,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; vm_fault_t ret = VM_FAULT_SIGBUS; - bool must_wait, return_to_userland; + bool must_wait; long blocking_state; /* @@ -462,11 +491,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) uwq.ctx = ctx; uwq.waken = false; - return_to_userland = - (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == - (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); - blocking_state = return_to_userland ? TASK_INTERRUPTIBLE : - TASK_KILLABLE; + blocking_state = userfaultfd_get_blocking_state(vmf->flags); spin_lock_irq(&ctx->fault_pending_wqh.lock); /* @@ -492,8 +517,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) up_read(&mm->mmap_sem); if (likely(must_wait && !READ_ONCE(ctx->released) && - (return_to_userland ? !signal_pending(current) : - !fatal_signal_pending(current)))) { + !userfaultfd_signal_pending(vmf->flags))) { wake_up_poll(&ctx->fd_wqh, EPOLLIN); schedule(); ret |= VM_FAULT_MAJOR; @@ -515,8 +539,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); if (READ_ONCE(uwq.waken) || READ_ONCE(ctx->released) || - (return_to_userland ? signal_pending(current) : - fatal_signal_pending(current))) + userfaultfd_signal_pending(vmf->flags)) break; schedule(); } @@ -524,30 +547,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) __set_current_state(TASK_RUNNING); - if (return_to_userland) { - if (signal_pending(current) && - !fatal_signal_pending(current)) { - /* - * If we got a SIGSTOP or SIGCONT and this is - * a normal userland page fault, just let - * userland return so the signal will be - * handled and gdb debugging works. The page - * fault code immediately after we return from - * this function is going to release the - * mmap_sem and it's not depending on it - * (unlike gup would if we were not to return - * VM_FAULT_RETRY). - * - * If a fatal signal is pending we still take - * the streamlined VM_FAULT_RETRY failure path - * and there's no need to retake the mmap_sem - * in such case. - */ - down_read(&mm->mmap_sem); - ret = VM_FAULT_NOPAGE; - } - } - /* * Here we race with the list_del; list_add in * userfaultfd_ctx_read(), however because we don't ever run @@ -1293,10 +1292,13 @@ static __always_inline int validate_range(struct mm_struct *mm, return 0; } -static inline bool vma_can_userfault(struct vm_area_struct *vma) +static inline bool vma_can_userfault(struct vm_area_struct *vma, + unsigned long vm_flags) { - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); + /* FIXME: add WP support to hugetlbfs and shmem */ + return vma_is_anonymous(vma) || + ((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) && + !(vm_flags & VM_UFFD_WP)); } static int userfaultfd_register(struct userfaultfd_ctx *ctx, @@ -1328,15 +1330,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; - if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) vm_flags |= VM_UFFD_WP; - /* - * FIXME: remove the below error constraint by - * implementing the wprotect tracking mode. - */ - ret = -EINVAL; - goto out; - } ret = validate_range(mm, &uffdio_register.range.start, uffdio_register.range.len); @@ -1386,7 +1381,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_can_userfault(cur)) + if (!vma_can_userfault(cur, vm_flags)) goto out_unlock; /* @@ -1414,6 +1409,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (end & (vma_hpagesize - 1)) goto out_unlock; } + if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) + goto out_unlock; /* * Check that this vma isn't already owned by a @@ -1443,7 +1440,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_can_userfault(vma)); + BUG_ON(!vma_can_userfault(vma, vm_flags)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); @@ -1498,14 +1495,24 @@ out_unlock: up_write(&mm->mmap_sem); mmput(mm); if (!ret) { + __u64 ioctls_out; + + ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : + UFFD_API_RANGE_IOCTLS; + + /* + * Declare the WP ioctl only if the WP mode is + * specified and all checks passed with the range + */ + if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP)) + ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); + /* * Now that we scanned all vmas we can already tell * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : - UFFD_API_RANGE_IOCTLS, - &user_uffdio_register->ioctls)) + if (put_user(ioctls_out, &user_uffdio_register->ioctls)) ret = -EFAULT; } out: @@ -1581,7 +1588,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_can_userfault(cur)) + if (!vma_can_userfault(cur, cur->vm_flags)) goto out_unlock; found = true; @@ -1595,7 +1602,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, do { cond_resched(); - BUG_ON(!vma_can_userfault(vma)); + BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); /* * Nothing to do: this vma is already registered into this @@ -1730,11 +1737,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) goto out; - if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing); + uffdio_copy.len, &ctx->mmap_changing, + uffdio_copy.mode); mmput(ctx->mm); } else { return -ESRCH; @@ -1807,6 +1815,53 @@ out: return ret; } +static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_writeprotect uffdio_wp; + struct uffdio_writeprotect __user *user_uffdio_wp; + struct userfaultfd_wake_range range; + bool mode_wp, mode_dontwake; + + if (READ_ONCE(ctx->mmap_changing)) + return -EAGAIN; + + user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; + + if (copy_from_user(&uffdio_wp, user_uffdio_wp, + sizeof(struct uffdio_writeprotect))) + return -EFAULT; + + ret = validate_range(ctx->mm, &uffdio_wp.range.start, + uffdio_wp.range.len); + if (ret) + return ret; + + if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE | + UFFDIO_WRITEPROTECT_MODE_WP)) + return -EINVAL; + + mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP; + mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE; + + if (mode_wp && mode_dontwake) + return -EINVAL; + + ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp, + &ctx->mmap_changing); + if (ret) + return ret; + + if (!mode_wp && !mode_dontwake) { + range.start = uffdio_wp.range.start; + range.len = uffdio_wp.range.len; + wake_userfault(ctx, &range); + } + return ret; +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -1888,6 +1943,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_ZEROPAGE: ret = userfaultfd_zeropage(ctx, arg); break; + case UFFDIO_WRITEPROTECT: + ret = userfaultfd_writeprotect(ctx, arg); + break; } return ret; } diff --git a/fs/xattr.c b/fs/xattr.c index 90dd78f0eb27..e13265e65871 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -817,7 +817,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) if (len < sizeof(*new_xattr)) return NULL; - new_xattr = kmalloc(len, GFP_KERNEL); + new_xattr = kvmalloc(len, GFP_KERNEL); if (!new_xattr) return NULL; @@ -860,6 +860,7 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * @value: value of the xattr. If %NULL, will remove the attribute. * @size: size of the new xattr * @flags: %XATTR_{CREATE|REPLACE} + * @removed_size: returns size of the removed xattr, -1 if none removed * * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; @@ -868,7 +869,8 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * Returns 0 on success, -errno on failure. */ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags) + const void *value, size_t size, int flags, + ssize_t *removed_size) { struct simple_xattr *xattr; struct simple_xattr *new_xattr = NULL; @@ -882,7 +884,7 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, new_xattr->name = kstrdup(name, GFP_KERNEL); if (!new_xattr->name) { - kfree(new_xattr); + kvfree(new_xattr); return -ENOMEM; } } @@ -895,8 +897,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, err = -EEXIST; } else if (new_xattr) { list_replace(&xattr->list, &new_xattr->list); + if (removed_size) + *removed_size = xattr->size; } else { list_del(&xattr->list); + if (removed_size) + *removed_size = xattr->size; } goto out; } @@ -908,11 +914,14 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, list_add(&new_xattr->list, &xattrs->head); xattr = NULL; } + + if (removed_size) + *removed_size = -1; out: spin_unlock(&xattrs->lock); if (xattr) { kfree(xattr->name); - kfree(xattr); + kvfree(xattr); } return err; diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index aceca2f9a3db..4f95df476181 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -26,6 +26,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_bmap.o \ xfs_bmap_btree.o \ xfs_btree.o \ + xfs_btree_staging.o \ xfs_da_btree.o \ xfs_defer.o \ xfs_dir2.o \ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 08d6beb54f8c..9d84007a5c65 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -231,7 +231,7 @@ xfs_sbblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; xfs_sb_to_disk(dsb, &mp->m_sb); dsb->sb_inprogress = 1; @@ -243,7 +243,7 @@ xfs_agfblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + struct xfs_agf *agf = bp->b_addr; xfs_extlen_t tmpsize; agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); @@ -301,7 +301,7 @@ xfs_agflblock_init( uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); } - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); + agfl_bno = xfs_buf_to_agfl_bno(bp); for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); } @@ -312,7 +312,7 @@ xfs_agiblock_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + struct xfs_agi *agi = bp->b_addr; int bucket; agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); @@ -502,7 +502,7 @@ xfs_ag_extend_space( if (error) return error; - agi = XFS_BUF_TO_AGI(bp); + agi = bp->b_addr; be32_add_cpu(&agi->agi_length, len); ASSERT(id->agno == mp->m_sb.sb_agcount - 1 || be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); @@ -515,7 +515,7 @@ xfs_ag_extend_space( if (error) return error; - agf = XFS_BUF_TO_AGF(bp); + agf = bp->b_addr; be32_add_cpu(&agf->agf_length, len); ASSERT(agf->agf_length == agi->agi_length); xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); @@ -569,11 +569,11 @@ xfs_ag_get_geometry( memset(ageo, 0, sizeof(*ageo)); ageo->ag_number = agno; - agi = XFS_BUF_TO_AGI(agi_bp); + agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); ageo->ag_ifree = be32_to_cpu(agi->agi_freecount); - agf = XFS_BUF_TO_AGF(agf_bp); + agf = agf_bp->b_addr; ageo->ag_length = be32_to_cpu(agf->agf_length); freeblks = pag->pagf_freeblks + pag->pagf_flcount + diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index d8053bc96c4d..203e74fa64aa 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -151,7 +151,7 @@ xfs_alloc_lookup_eq( cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); - cur->bc_private.a.priv.abt.active = (*stat == 1); + cur->bc_ag.abt.active = (*stat == 1); return error; } @@ -171,7 +171,7 @@ xfs_alloc_lookup_ge( cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); - cur->bc_private.a.priv.abt.active = (*stat == 1); + cur->bc_ag.abt.active = (*stat == 1); return error; } @@ -190,7 +190,7 @@ xfs_alloc_lookup_le( cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); - cur->bc_private.a.priv.abt.active = (*stat == 1); + cur->bc_ag.abt.active = (*stat == 1); return error; } @@ -198,7 +198,7 @@ static inline bool xfs_alloc_cur_active( struct xfs_btree_cur *cur) { - return cur && cur->bc_private.a.priv.abt.active; + return cur && cur->bc_ag.abt.active; } /* @@ -230,7 +230,7 @@ xfs_alloc_get_rec( int *stat) /* output: success/failure */ { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; @@ -589,6 +589,7 @@ xfs_agfl_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); int i; /* @@ -614,8 +615,8 @@ xfs_agfl_verify( return __this_address; for (i = 0; i < xfs_agfl_size(mp); i++) { - if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && - be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK && + be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks) return __this_address; } @@ -713,7 +714,7 @@ xfs_alloc_update_counters( struct xfs_buf *agbp, long len) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_agf *agf = agbp->b_addr; pag->pagf_freeblks += len; be32_add_cpu(&agf->agf_freeblks, len); @@ -721,7 +722,7 @@ xfs_alloc_update_counters( xfs_trans_agblocks_delta(tp, len); if (unlikely(be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))) { - xfs_buf_corruption_error(agbp); + xfs_buf_mark_corrupt(agbp); return -EFSCORRUPTED; } @@ -907,7 +908,7 @@ xfs_alloc_cur_check( deactivate = true; out: if (deactivate) - cur->bc_private.a.priv.abt.active = false; + cur->bc_ag.abt.active = false; trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, *new); return 0; @@ -922,13 +923,13 @@ xfs_alloc_cur_finish( struct xfs_alloc_arg *args, struct xfs_alloc_cur *acur) { + struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; int error; ASSERT(acur->cnt && acur->bnolt); ASSERT(acur->bno >= acur->rec_bno); ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); - ASSERT(acur->rec_bno + acur->rec_len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, acur->rec_len, acur->bno, acur->len, 0); @@ -1026,6 +1027,7 @@ xfs_alloc_ag_vextent_small( xfs_extlen_t *flenp, /* result length */ int *stat) /* status: 0-freelist, 1-normal/none */ { + struct xfs_agf *agf = args->agbp->b_addr; int error = 0; xfs_agblock_t fbno = NULLAGBLOCK; xfs_extlen_t flen = 0; @@ -1054,8 +1056,7 @@ xfs_alloc_ag_vextent_small( if (args->minlen != 1 || args->alignment != 1 || args->resv == XFS_AG_RESV_AGFL || - (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <= - args->minleft)) + be32_to_cpu(agf->agf_flcount) <= args->minleft) goto out; error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); @@ -1079,9 +1080,7 @@ xfs_alloc_ag_vextent_small( } *fbnop = args->agbno = fbno; *flenp = args->len = 1; - if (XFS_IS_CORRUPT(args->mp, - fbno >= be32_to_cpu( - XFS_BUF_TO_AGF(args->agbp)->agf_length))) { + if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { error = -EFSCORRUPTED; goto error; } @@ -1203,6 +1202,7 @@ STATIC int /* error */ xfs_alloc_ag_vextent_exact( xfs_alloc_arg_t *args) /* allocation argument structure */ { + struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ int error; @@ -1281,8 +1281,7 @@ xfs_alloc_ag_vextent_exact( */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); - ASSERT(args->agbno + args->len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); if (error) { @@ -1353,7 +1352,7 @@ xfs_alloc_walk_iter( if (error) return error; if (i == 0) - cur->bc_private.a.priv.abt.active = false; + cur->bc_ag.abt.active = false; if (count > 0) count--; @@ -1468,7 +1467,7 @@ xfs_alloc_ag_vextent_locality( if (error) return error; if (i) { - acur->cnt->bc_private.a.priv.abt.active = true; + acur->cnt->bc_ag.abt.active = true; fbcur = acur->cnt; fbinc = false; } @@ -1515,7 +1514,7 @@ xfs_alloc_ag_vextent_lastblock( * maxlen, go to the start of this block, and skip all those smaller * than minlen. */ - if (len || args->alignment > 1) { + if (*len || args->alignment > 1) { acur->cnt->bc_ptrs[0] = 1; do { error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); @@ -1661,6 +1660,7 @@ STATIC int /* error */ xfs_alloc_ag_vextent_size( xfs_alloc_arg_t *args) /* allocation argument structure */ { + struct xfs_agf *agf = args->agbp->b_addr; xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ int error; /* error result */ @@ -1851,8 +1851,7 @@ restart: args->agbno = rbno; if (XFS_IS_CORRUPT(args->mp, args->agbno + args->len > - be32_to_cpu( - XFS_BUF_TO_AGF(args->agbp)->agf_length))) { + be32_to_cpu(agf->agf_length))) { error = -EFSCORRUPTED; goto error0; } @@ -2424,7 +2423,7 @@ xfs_agfl_reset( struct xfs_perag *pag) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_agf *agf = agbp->b_addr; ASSERT(pag->pagf_agflreset); trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); @@ -2655,7 +2654,7 @@ xfs_alloc_get_freelist( xfs_agblock_t *bnop, /* block address retrieved from freelist */ int btreeblk) /* destination is a AGF btree */ { - xfs_agf_t *agf; /* a.g. freespace structure */ + struct xfs_agf *agf = agbp->b_addr; xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ xfs_agblock_t bno; /* block number returned */ __be32 *agfl_bno; @@ -2667,7 +2666,6 @@ xfs_alloc_get_freelist( /* * Freelist is empty, give up. */ - agf = XFS_BUF_TO_AGF(agbp); if (!agf->agf_flcount) { *bnop = NULLAGBLOCK; return 0; @@ -2684,7 +2682,7 @@ xfs_alloc_get_freelist( /* * Get the block number and update the data structures. */ - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); @@ -2745,7 +2743,7 @@ xfs_alloc_log_agf( sizeof(xfs_agf_t) }; - trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); @@ -2783,18 +2781,15 @@ xfs_alloc_put_freelist( xfs_agblock_t bno, /* block being freed */ int btreeblk) /* block came from a AGF btree */ { - xfs_agf_t *agf; /* a.g. freespace structure */ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agf *agf = agbp->b_addr; __be32 *blockp;/* pointer to array entry */ int error; int logflags; - xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ __be32 *agfl_bno; int startoff; - agf = XFS_BUF_TO_AGF(agbp); - mp = tp->t_mountp; - if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), &agflbp))) return error; @@ -2820,7 +2815,7 @@ xfs_alloc_put_freelist( ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; *blockp = cpu_to_be32(bno); startoff = (char *)blockp - (char *)agflbp->b_addr; @@ -2838,13 +2833,12 @@ xfs_agf_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; - struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + struct xfs_agf *agf = bp->b_addr; if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (!xfs_log_check_lsn(mp, - be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn))) + if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) return __this_address; } @@ -2858,6 +2852,13 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) return __this_address; + if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks) + return __this_address; + + if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || + be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length)) + return __this_address; + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || @@ -2869,6 +2870,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) return __this_address; + if (xfs_sb_version_hasrmapbt(&mp->m_sb) && + be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) + return __this_address; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't @@ -2883,6 +2888,11 @@ xfs_agf_verify( return __this_address; if (xfs_sb_version_hasreflink(&mp->m_sb) && + be32_to_cpu(agf->agf_refcount_blocks) > + be32_to_cpu(agf->agf_length)) + return __this_address; + + if (xfs_sb_version_hasreflink(&mp->m_sb) && (be32_to_cpu(agf->agf_refcount_level) < 1 || be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) return __this_address; @@ -2914,6 +2924,7 @@ xfs_agf_write_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_agf *agf = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agf_verify(bp); @@ -2926,7 +2937,7 @@ xfs_agf_write_verify( return; if (bip) - XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); } @@ -2994,7 +3005,7 @@ xfs_alloc_read_agf( return error; ASSERT(!(*bpp)->b_error); - agf = XFS_BUF_TO_AGF(*bpp); + agf = (*bpp)->b_addr; pag = xfs_perag_get(mp, agno); if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); @@ -3275,6 +3286,7 @@ __xfs_free_extent( struct xfs_buf *agbp; xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); + struct xfs_agf *agf; int error; unsigned int busy_flags = 0; @@ -3288,6 +3300,7 @@ __xfs_free_extent( error = xfs_free_extent_fix_freelist(tp, agno, &agbp); if (error) return error; + agf = agbp->b_addr; if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { error = -EFSCORRUPTED; @@ -3295,9 +3308,7 @@ __xfs_free_extent( } /* validate the extent size is legal now we have the agf locked */ - if (XFS_IS_CORRUPT(mp, - agbno + len > - be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length))) { + if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { error = -EFSCORRUPTED; goto err; } @@ -3408,7 +3419,7 @@ xfs_agfl_walk( unsigned int i; int error; - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + agfl_bno = xfs_buf_to_agfl_bno(agflbp); i = be32_to_cpu(agf->agf_flfirst); /* Nothing to walk in an empty AGFL. */ diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 7380fbe4a3ff..a851bf77f17b 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -236,4 +236,13 @@ typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno, int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf, struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv); +static inline __be32 * +xfs_buf_to_agfl_bno( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_mount->m_sb)) + return bp->b_addr + sizeof(struct xfs_agfl); + return bp->b_addr; +} + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 279694d73e4e..60c453cb3ee3 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -12,6 +12,7 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" @@ -25,7 +26,7 @@ xfs_allocbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_ag.agbp, cur->bc_ag.agno, cur->bc_btnum); } @@ -35,8 +36,8 @@ xfs_allocbt_set_root( union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); @@ -62,7 +63,7 @@ xfs_allocbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -72,7 +73,7 @@ xfs_allocbt_alloc_block( return 0; } - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); @@ -86,8 +87,8 @@ xfs_allocbt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agblock_t bno; int error; @@ -113,7 +114,7 @@ xfs_allocbt_update_lastrec( int ptr, int reason) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); struct xfs_perag *pag; __be32 len; @@ -162,7 +163,7 @@ xfs_allocbt_update_lastrec( pag = xfs_perag_get(cur->bc_mp, seqno); pag->pagf_longest = be32_to_cpu(len); xfs_perag_put(pag); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_ag.agbp, XFS_AGF_LONGEST); } STATIC int @@ -226,9 +227,9 @@ xfs_allocbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -471,18 +472,14 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .recs_inorder = xfs_cntbt_recs_inorder, }; -/* - * Allocate a new allocation btree cursor. - */ -struct xfs_btree_cur * /* new alloc btree cursor */ -xfs_allocbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for agf structure */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_btnum_t btnum) /* btree identifier */ +/* Allocate most of a new allocation btree cursor. */ +STATIC struct xfs_btree_cur * +xfs_allocbt_init_common( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_btnum_t btnum) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); @@ -495,19 +492,16 @@ xfs_allocbt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; if (btnum == XFS_BTNUM_CNT) { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); cur->bc_ops = &xfs_cntbt_ops; - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; } else { - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); cur->bc_ops = &xfs_bnobt_ops; - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); } - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; - cur->bc_private.a.priv.abt.active = false; + cur->bc_ag.agno = agno; + cur->bc_ag.abt.active = false; if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; @@ -516,6 +510,73 @@ xfs_allocbt_init_cursor( } /* + * Allocate a new allocation btree cursor. + */ +struct xfs_btree_cur * /* new alloc btree cursor */ +xfs_allocbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agf structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* btree identifier */ +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_allocbt_init_common(mp, tp, agno, btnum); + if (btnum == XFS_BTNUM_CNT) + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + else + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + + cur->bc_ag.agbp = agbp; + + return cur; +} + +/* Create a free space btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_allocbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + + cur = xfs_allocbt_init_common(mp, NULL, agno, btnum); + xfs_btree_stage_afakeroot(cur, afake); + return cur; +} + +/* + * Install a new free space btree root. Caller is responsible for invalidating + * and freeing the old btree blocks. + */ +void +xfs_allocbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); + agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); + + if (cur->bc_btnum == XFS_BTNUM_BNO) { + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops); + } else { + cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE; + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops); + } +} + +/* * Calculate number of records in an alloc btree block. */ int diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index c9305ebb69f6..047f09f0be3c 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* * Btree block header size depends on a superblock flag. @@ -48,8 +49,14 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, xfs_btnum_t); +struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno, + xfs_btnum_t btnum); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, unsigned long long len); +void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e6149720ce02..e4fe3dca9883 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -56,33 +56,6 @@ STATIC int xfs_attr_node_removename(xfs_da_args_t *args); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); - -STATIC int -xfs_attr_args_init( - struct xfs_da_args *args, - struct xfs_inode *dp, - const unsigned char *name, - size_t namelen, - int flags) -{ - - if (!name) - return -EINVAL; - - memset(args, 0, sizeof(*args)); - args->geo = dp->i_mount->m_attr_geo; - args->whichfork = XFS_ATTR_FORK; - args->dp = dp; - args->flags = flags; - args->name = name; - args->namelen = namelen; - if (args->namelen >= MAXNAMELEN) - return -EFAULT; /* match IRIX behaviour */ - - args->hashval = xfs_da_hashname(args->name, args->namelen); - return 0; -} - int xfs_inode_hasattr( struct xfs_inode *ip) @@ -104,85 +77,60 @@ xfs_inode_hasattr( */ int xfs_attr_get_ilocked( - struct xfs_inode *ip, struct xfs_da_args *args) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - if (!xfs_inode_hasattr(ip)) + if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; - else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) + + if (args->dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); - else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) + if (xfs_bmap_one_block(args->dp, XFS_ATTR_FORK)) return xfs_attr_leaf_get(args); - else - return xfs_attr_node_get(args); + return xfs_attr_node_get(args); } /* * Retrieve an extended attribute by name, and its value if requested. * - * If ATTR_KERNOVAL is set in @flags, then the caller does not want the value, - * just an indication whether the attribute exists and the size of the value if - * it exists. The size is returned in @valuelenp, + * If args->valuelen is zero, then the caller does not want the value, just an + * indication whether the attribute exists and the size of the value if it + * exists. The size is returned in args.valuelen. * - * If the attribute is found, but exceeds the size limit set by the caller in - * @valuelenp, return -ERANGE with the size of the attribute that was found in - * @valuelenp. + * If args->value is NULL but args->valuelen is non-zero, allocate the buffer + * for the value after existence of the attribute has been determined. The + * caller always has to free args->value if it is set, no matter if this + * function was successful or not. * - * If ATTR_ALLOC is set in @flags, allocate the buffer for the value after - * existence of the attribute has been determined. On success, return that - * buffer to the caller and leave them to free it. On failure, free any - * allocated buffer and ensure the buffer pointer returned to the caller is - * null. + * If the attribute is found, but exceeds the size limit set by the caller in + * args->valuelen, return -ERANGE with the size of the attribute that was found + * in args->valuelen. */ int xfs_attr_get( - struct xfs_inode *ip, - const unsigned char *name, - size_t namelen, - unsigned char **value, - int *valuelenp, - int flags) + struct xfs_da_args *args) { - struct xfs_da_args args; uint lock_mode; int error; - ASSERT((flags & (ATTR_ALLOC | ATTR_KERNOVAL)) || *value); - - XFS_STATS_INC(ip->i_mount, xs_attr_get); + XFS_STATS_INC(args->dp->i_mount, xs_attr_get); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(args->dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, ip, name, namelen, flags); - if (error) - return error; + args->geo = args->dp->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->hashval = xfs_da_hashname(args->name, args->namelen); /* Entirely possible to look up a name which doesn't exist */ - args.op_flags = XFS_DA_OP_OKNOENT; - if (flags & ATTR_ALLOC) - args.op_flags |= XFS_DA_OP_ALLOCVAL; - else - args.value = *value; - args.valuelen = *valuelenp; + args->op_flags = XFS_DA_OP_OKNOENT; - lock_mode = xfs_ilock_attr_map_shared(ip); - error = xfs_attr_get_ilocked(ip, &args); - xfs_iunlock(ip, lock_mode); - *valuelenp = args.valuelen; + lock_mode = xfs_ilock_attr_map_shared(args->dp); + error = xfs_attr_get_ilocked(args); + xfs_iunlock(args->dp, lock_mode); - /* on error, we have to clean up allocated value buffers */ - if (error) { - if (flags & ATTR_ALLOC) { - kmem_free(args.value); - *value = NULL; - } - return error; - } - *value = args.value; - return 0; + return error; } /* @@ -238,7 +186,7 @@ xfs_attr_try_sf_addname( * Commit the shortform mods, and we're done. * NOTE: this is also the error path (EEXIST, etc). */ - if (!error && (args->flags & ATTR_KERNOTIME) == 0) + if (!error && !(args->op_flags & XFS_DA_OP_NOTIME)) xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); if (mp->m_flags & XFS_MOUNT_WSYNC) @@ -336,188 +284,127 @@ xfs_attr_remove_args( return error; } +/* + * Note: If args->value is NULL the attribute will be removed, just like the + * Linux ->setattr API. + */ int xfs_attr_set( - struct xfs_inode *dp, - const unsigned char *name, - size_t namelen, - unsigned char *value, - int valuelen, - int flags) + struct xfs_da_args *args) { + struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_da_args args; struct xfs_trans_res tres; - int rsvd = (flags & ATTR_ROOT) != 0; + bool rsvd = (args->attr_filter & XFS_ATTR_ROOT); int error, local; - - XFS_STATS_INC(mp, xs_attr_set); + unsigned int total; if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - error = xfs_attr_args_init(&args, dp, name, namelen, flags); - if (error) - return error; - - args.value = value; - args.valuelen = valuelen; - args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; - args.total = xfs_attr_calc_size(&args, &local); - error = xfs_qm_dqattach(dp); if (error) return error; - /* - * If the inode doesn't have an attribute fork, add one. - * (inode must not be locked when we call this routine) - */ - if (XFS_IFORK_Q(dp) == 0) { - int sf_size = sizeof(xfs_attr_sf_hdr_t) + - XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen); - - error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); - if (error) - return error; - } - - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * args.total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - - /* - * Root fork attributes can use reserved data blocks for this - * operation if necessary - */ - error = xfs_trans_alloc(mp, &tres, args.total, 0, - rsvd ? XFS_TRANS_RESERVE : 0, &args.trans); - if (error) - return error; - - xfs_ilock(dp, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, - rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_trans_cancel; - - xfs_trans_ijoin(args.trans, dp, 0); - error = xfs_attr_set_args(&args); - if (error) - goto out_trans_cancel; - if (!args.trans) { - /* shortform attribute has already been committed */ - goto out_unlock; - } - - /* - * If this is a synchronous mount, make sure that the - * transaction goes to disk before returning to the user. - */ - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(args.trans); - - if ((flags & ATTR_KERNOTIME) == 0) - xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + args->geo = mp->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->hashval = xfs_da_hashname(args->name, args->namelen); /* - * Commit the last in the sequence of transactions. + * We have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail as well. */ - xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans); -out_unlock: - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; + args->op_flags = XFS_DA_OP_OKNOENT; -out_trans_cancel: - if (args.trans) - xfs_trans_cancel(args.trans); - goto out_unlock; -} + if (args->value) { + XFS_STATS_INC(mp, xs_attr_set); -/* - * Generic handler routine to remove a name from an attribute list. - * Transitions attribute list from Btree to shortform as necessary. - */ -int -xfs_attr_remove( - struct xfs_inode *dp, - const unsigned char *name, - size_t namelen, - int flags) -{ - struct xfs_mount *mp = dp->i_mount; - struct xfs_da_args args; - int error; - - XFS_STATS_INC(mp, xs_attr_remove); + args->op_flags |= XFS_DA_OP_ADDNAME; + args->total = xfs_attr_calc_size(args, &local); - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return -EIO; + /* + * If the inode doesn't have an attribute fork, add one. + * (inode must not be locked when we call this routine) + */ + if (XFS_IFORK_Q(dp) == 0) { + int sf_size = sizeof(struct xfs_attr_sf_hdr) + + XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, + args->valuelen); - error = xfs_attr_args_init(&args, dp, name, namelen, flags); - if (error) - return error; + error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + if (error) + return error; + } - /* - * we have no control over the attribute names that userspace passes us - * to remove, so we have to allow the name lookup prior to attribute - * removal to fail. - */ - args.op_flags = XFS_DA_OP_OKNOENT; + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * + args->total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + total = args->total; + } else { + XFS_STATS_INC(mp, xs_attr_remove); - error = xfs_qm_dqattach(dp); - if (error) - return error; + tres = M_RES(mp)->tr_attrrm; + total = XFS_ATTRRM_SPACE_RES(mp); + } /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm, - XFS_ATTRRM_SPACE_RES(mp), 0, - (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0, - &args.trans); + error = xfs_trans_alloc(mp, &tres, total, 0, + rsvd ? XFS_TRANS_RESERVE : 0, &args->trans); if (error) return error; xfs_ilock(dp, XFS_ILOCK_EXCL); - /* - * No need to make quota reservations here. We expect to release some - * blocks not allocate in the common case. - */ - xfs_trans_ijoin(args.trans, dp, 0); - - error = xfs_attr_remove_args(&args); - if (error) - goto out; + xfs_trans_ijoin(args->trans, dp, 0); + if (args->value) { + unsigned int quota_flags = XFS_QMOPT_RES_REGBLKS; + + if (rsvd) + quota_flags |= XFS_QMOPT_FORCE_RES; + error = xfs_trans_reserve_quota_nblks(args->trans, dp, + args->total, 0, quota_flags); + if (error) + goto out_trans_cancel; + error = xfs_attr_set_args(args); + if (error) + goto out_trans_cancel; + /* shortform attribute has already been committed */ + if (!args->trans) + goto out_unlock; + } else { + error = xfs_attr_remove_args(args); + if (error) + goto out_trans_cancel; + } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(args.trans); + xfs_trans_set_sync(args->trans); - if ((flags & ATTR_KERNOTIME) == 0) - xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); + if (!(args->op_flags & XFS_DA_OP_NOTIME)) + xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); /* * Commit the last in the sequence of transactions. */ - xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + error = xfs_trans_commit(args->trans); +out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; -out: - if (args.trans) - xfs_trans_cancel(args.trans); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; +out_trans_cancel: + if (args->trans) + xfs_trans_cancel(args->trans); + goto out_unlock; } /*======================================================================== @@ -536,10 +423,10 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) trace_xfs_attr_sf_addname(args); retval = xfs_attr_shortform_lookup(args); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) return retval; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) + if (retval == -EEXIST) { + if (args->attr_flags & XATTR_CREATE) return retval; retval = xfs_attr_shortform_remove(args); if (retval) @@ -549,7 +436,7 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * that the leaf format add routine won't trip over the attr * not being around. */ - args->flags &= ~ATTR_REPLACE; + args->attr_flags &= ~XATTR_REPLACE; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || @@ -602,14 +489,11 @@ xfs_attr_leaf_addname( * the given flags produce an error or call for an atomic rename. */ retval = xfs_attr3_leaf_lookup_int(bp, args); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { - xfs_trans_brelse(args->trans, bp); - return retval; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) { /* pure create op */ - xfs_trans_brelse(args->trans, bp); - return retval; - } + if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) + goto out_brelse; + if (retval == -EEXIST) { + if (args->attr_flags & XATTR_CREATE) + goto out_brelse; trace_xfs_attr_leaf_replace(args); @@ -750,6 +634,9 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_clearflag(args); } return error; +out_brelse: + xfs_trans_brelse(args->trans, bp); + return retval; } /* @@ -876,10 +763,10 @@ restart: goto out; blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) { + if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) goto out; - } else if (retval == -EEXIST) { - if (args->flags & ATTR_CREATE) + if (retval == -EEXIST) { + if (args->attr_flags & XATTR_CREATE) goto out; trace_xfs_attr_node_replace(args); @@ -1011,7 +898,7 @@ restart: * The INCOMPLETE flag means that we will find the "old" * attr, not the "new" one. */ - args->op_flags |= XFS_DA_OP_INCOMPLETE; + args->attr_filter |= XFS_ATTR_INCOMPLETE; state = xfs_da_state_alloc(); state->args = args; state->mp = mp; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 4243b2272642..0d2d05908537 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -21,39 +21,6 @@ struct xfs_attr_list_context; * as possible so as to fit into the literal area of the inode. */ -/*======================================================================== - * External interfaces - *========================================================================*/ - - -#define ATTR_DONTFOLLOW 0x0001 /* -- ignored, from IRIX -- */ -#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ -#define ATTR_TRUST 0x0004 /* -- unused, from IRIX -- */ -#define ATTR_SECURE 0x0008 /* use attrs in security namespace */ -#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ -#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ - -#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ -#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ - -#define ATTR_INCOMPLETE 0x4000 /* [kernel] return INCOMPLETE attr keys */ -#define ATTR_ALLOC 0x8000 /* [kernel] allocate xattr buffer on demand */ - -#define ATTR_KERNEL_FLAGS \ - (ATTR_KERNOTIME | ATTR_KERNOVAL | ATTR_INCOMPLETE | ATTR_ALLOC) - -#define XFS_ATTR_FLAGS \ - { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ - { ATTR_ROOT, "ROOT" }, \ - { ATTR_TRUST, "TRUST" }, \ - { ATTR_SECURE, "SECURE" }, \ - { ATTR_CREATE, "CREATE" }, \ - { ATTR_REPLACE, "REPLACE" }, \ - { ATTR_KERNOTIME, "KERNOTIME" }, \ - { ATTR_KERNOVAL, "KERNOVAL" }, \ - { ATTR_INCOMPLETE, "INCOMPLETE" }, \ - { ATTR_ALLOC, "ALLOC" } - /* * The maximum size (into the kernel or returned from the kernel) of an * attribute value or the buffer used for an attr_list() call. Larger @@ -62,45 +29,16 @@ struct xfs_attr_list_context; #define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */ /* - * Define how lists of attribute names are returned to the user from - * the attr_list() call. A large, 32bit aligned, buffer is passed in - * along with its size. We put an array of offsets at the top that each - * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom. - */ -typedef struct attrlist { - __s32 al_count; /* number of entries in attrlist */ - __s32 al_more; /* T/F: more attrs (do call again) */ - __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ -} attrlist_t; - -/* - * Show the interesting info about one attribute. This is what the - * al_offset[i] entry points to. - */ -typedef struct attrlist_ent { /* data from attr_list() */ - __u32 a_valuelen; /* number bytes in value of attr */ - char a_name[1]; /* attr name (NULL terminated) */ -} attrlist_ent_t; - -/* - * Given a pointer to the (char*) buffer containing the attr_list() result, - * and an index, return a pointer to the indicated attribute in the buffer. - */ -#define ATTR_ENTRY(buffer, index) \ - ((attrlist_ent_t *) \ - &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) - -/* * Kernel-internal version of the attrlist cursor. */ -typedef struct attrlist_cursor_kern { +struct xfs_attrlist_cursor_kern { __u32 hashval; /* hash value of next entry to add */ __u32 blkno; /* block containing entry (suggestion) */ __u32 offset; /* offset in list of equal-hashvals */ __u16 pad1; /* padding to match user-level */ __u8 pad2; /* padding to match user-level */ __u8 initted; /* T/F: cursor has been initialized */ -} attrlist_cursor_kern_t; +}; /*======================================================================== @@ -112,27 +50,28 @@ typedef struct attrlist_cursor_kern { typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); -typedef struct xfs_attr_list_context { - struct xfs_trans *tp; - struct xfs_inode *dp; /* inode */ - struct attrlist_cursor_kern *cursor; /* position in list */ - char *alist; /* output buffer */ +struct xfs_attr_list_context { + struct xfs_trans *tp; + struct xfs_inode *dp; /* inode */ + struct xfs_attrlist_cursor_kern cursor; /* position in list */ + void *buffer; /* output buffer */ /* * Abort attribute list iteration if non-zero. Can be used to pass * error values to the xfs_attr_list caller. */ - int seen_enough; + int seen_enough; + bool allow_incomplete; - ssize_t count; /* num used entries */ - int dupcnt; /* count dup hashvals seen */ - int bufsize; /* total buffer size */ - int firstu; /* first used byte in buffer */ - int flags; /* from VOP call */ - int resynch; /* T/F: resynch with cursor */ - put_listent_func_t put_listent; /* list output fmt function */ - int index; /* index into output buffer */ -} xfs_attr_list_context_t; + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE} */ + int resynch; /* T/F: resynch with cursor */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +}; /*======================================================================== @@ -143,21 +82,14 @@ typedef struct xfs_attr_list_context { * Overall external interface routines. */ int xfs_attr_inactive(struct xfs_inode *dp); -int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *); -int xfs_attr_list_int(struct xfs_attr_list_context *); +int xfs_attr_list_ilocked(struct xfs_attr_list_context *); +int xfs_attr_list(struct xfs_attr_list_context *); int xfs_inode_hasattr(struct xfs_inode *ip); -int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args); -int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, - size_t namelen, unsigned char **value, int *valuelenp, - int flags); -int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, - size_t namelen, unsigned char *value, int valuelen, int flags); +int xfs_attr_get_ilocked(struct xfs_da_args *args); +int xfs_attr_get(struct xfs_da_args *args); +int xfs_attr_set(struct xfs_da_args *args); int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, - size_t namelen, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); -int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, - int flags, struct attrlist_cursor_kern *cursor); bool xfs_attr_namecheck(const void *name, size_t length); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index fed537a4353d..863444e2dda7 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -445,14 +445,25 @@ xfs_attr3_leaf_read( * Namespace helper routines *========================================================================*/ -/* - * If namespace bits don't match return 0. - * If all match then return 1. - */ -STATIC int -xfs_attr_namesp_match(int arg_flags, int ondisk_flags) +static bool +xfs_attr_match( + struct xfs_da_args *args, + uint8_t namelen, + unsigned char *name, + int flags) { - return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); + if (args->namelen != namelen) + return false; + if (memcmp(args->name, name, namelen) != 0) + return false; + /* + * If we are looking for incomplete entries, show only those, else only + * show complete entries. + */ + if (args->attr_filter != + (flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE))) + return false; + return true; } static int @@ -464,7 +475,7 @@ xfs_attr_copy_value( /* * No copy if all we have to do is get the length */ - if (args->flags & ATTR_KERNOVAL) { + if (!args->valuelen) { args->valuelen = valuelen; return 0; } @@ -477,7 +488,7 @@ xfs_attr_copy_value( return -ERANGE; } - if (args->op_flags & XFS_DA_OP_ALLOCVAL) { + if (!args->value) { args->value = kmem_alloc_large(valuelen, 0); if (!args->value) return -ENOMEM; @@ -526,7 +537,7 @@ xfs_attr_shortform_bytesfit( int offset; /* rounded down */ - offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; + offset = (XFS_LITINO(mp) - bytes) >> 3; if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) { minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; @@ -593,8 +604,7 @@ xfs_attr_shortform_bytesfit( minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ if (offset >= maxforkoff) @@ -678,15 +688,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { -#ifdef DEBUG - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - ASSERT(0); -#endif + ASSERT(!xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)); } offset = (char *)sfe - (char *)sf; @@ -697,7 +700,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; - sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + sfe->flags = args->attr_filter; memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); sf->hdr.count++; @@ -749,13 +752,9 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) for (i = 0; i < end; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), base += size, i++) { size = XFS_ATTR_SF_ENTSIZE(sfe); - if (sfe->namelen != args->namelen) - continue; - if (memcmp(sfe->nameval, args->name, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - break; + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + break; } if (i == end) return -ENOATTR; @@ -816,13 +815,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - return -EEXIST; + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return -EEXIST; } return -ENOATTR; } @@ -830,9 +825,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) /* * Retrieve the attribute value and length. * - * If ATTR_KERNOVAL is specified, only the length needs to be returned. - * Unlike a lookup, we only return an error if the attribute does not - * exist or we can't retrieve the value. + * If args->valuelen is zero, only the length needs to be returned. Unlike a + * lookup, we only return an error if the attribute does not exist or we can't + * retrieve the value. */ int xfs_attr_shortform_getvalue( @@ -847,14 +842,10 @@ xfs_attr_shortform_getvalue( sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = XFS_ATTR_SF_NEXTENTRY(sfe), i++) { - if (sfe->namelen != args->namelen) - continue; - if (memcmp(args->name, sfe->nameval, args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, sfe->flags)) - continue; - return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], - sfe->valuelen); + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return xfs_attr_copy_value(args, + &sfe->nameval[args->namelen], sfe->valuelen); } return -ENOATTR; } @@ -918,7 +909,7 @@ xfs_attr_shortform_to_leaf( nargs.valuelen = sfe->valuelen; nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); - nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); + nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == -ENOATTR); error = xfs_attr3_leaf_add(bp, &nargs); @@ -1124,7 +1115,7 @@ xfs_attr3_leaf_to_shortform( nargs.value = &name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); - nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); + nargs.attr_filter = entry->flags & XFS_ATTR_NSP_ONDISK_MASK; xfs_attr_shortform_add(&nargs, forkoff); } error = 0; @@ -1449,8 +1440,9 @@ xfs_attr3_leaf_add_work( entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + ichdr->freemap[mapindex].size); entry->hashval = cpu_to_be32(args->hashval); - entry->flags = tmp ? XFS_ATTR_LOCAL : 0; - entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + entry->flags = args->attr_filter; + if (tmp) + entry->flags |= XFS_ATTR_LOCAL; if (args->op_flags & XFS_DA_OP_RENAME) { entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && @@ -2346,7 +2338,7 @@ xfs_attr3_leaf_lookup_int( xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); if (ichdr.count >= args->geo->blksize / 8) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -2365,11 +2357,11 @@ xfs_attr3_leaf_lookup_int( break; } if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -2399,33 +2391,17 @@ xfs_attr3_leaf_lookup_int( /* * GROT: Add code to remove incomplete entries. */ - /* - * If we are looking for INCOMPLETE entries, show only those. - * If we are looking for complete entries, show only those. - */ - if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) != - !!(entry->flags & XFS_ATTR_INCOMPLETE)) { - continue; - } if (entry->flags & XFS_ATTR_LOCAL) { name_loc = xfs_attr3_leaf_name_local(leaf, probe); - if (name_loc->namelen != args->namelen) - continue; - if (memcmp(args->name, name_loc->nameval, - args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, entry->flags)) + if (!xfs_attr_match(args, name_loc->namelen, + name_loc->nameval, entry->flags)) continue; args->index = probe; return -EEXIST; } else { name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); - if (name_rmt->namelen != args->namelen) - continue; - if (memcmp(args->name, name_rmt->name, - args->namelen) != 0) - continue; - if (!xfs_attr_namesp_match(args->flags, entry->flags)) + if (!xfs_attr_match(args, name_rmt->namelen, + name_rmt->name, entry->flags)) continue; args->index = probe; args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); @@ -2444,9 +2420,9 @@ xfs_attr3_leaf_lookup_int( * Get the value associated with an attribute name from a leaf attribute * list structure. * - * If ATTR_KERNOVAL is specified, only the length needs to be returned. - * Unlike a lookup, we only return an error if the attribute does not - * exist or we can't retrieve the value. + * If args->valuelen is zero, only the length needs to be returned. Unlike a + * lookup, we only return an error if the attribute does not exist or we can't + * retrieve the value. */ int xfs_attr3_leaf_getvalue( diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 73615b1dd1a8..6dd2d937a42a 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -8,7 +8,6 @@ #define __XFS_ATTR_LEAF_H__ struct attrlist; -struct attrlist_cursor_kern; struct xfs_attr_list_context; struct xfs_da_args; struct xfs_da_state; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 8b7f74b3bea2..01ad7f353e08 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -397,7 +397,7 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); - ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->valuelen != 0); ASSERT(args->rmtvaluelen == args->valuelen); valuelen = args->rmtvaluelen; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9a6d7a84689a..fda13cd7add0 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -193,14 +193,12 @@ xfs_default_attroffset( struct xfs_mount *mp = ip->i_mount; uint offset; - if (mp->m_sb.sb_inodesize == 256) { - offset = XFS_LITINO(mp, ip->i_d.di_version) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - } else { + if (mp->m_sb.sb_inodesize == 256) + offset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + else offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - } - ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version)); + ASSERT(offset < XFS_LITINO(mp)); return offset; } @@ -690,7 +688,7 @@ xfs_bmap_extents_to_btree( * Need a cursor. Can't allocate until bb_level is filled in. */ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; /* * Convert to a btree with two levels, one record in root. */ @@ -727,7 +725,7 @@ xfs_bmap_extents_to_btree( ASSERT(tp->t_firstblock == NULLFSBLOCK || args.agno >= XFS_FSB_TO_AGNO(mp, tp->t_firstblock)); tp->t_firstblock = args.fsbno; - cur->bc_private.b.allocated++; + cur->bc_ino.allocated++; ip->i_d.di_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, @@ -953,7 +951,7 @@ xfs_bmap_add_attrfork_btree( xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return -ENOSPC; } - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); } return 0; @@ -980,7 +978,7 @@ xfs_bmap_add_attrfork_extents( error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, XFS_DATA_FORK); if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -1178,13 +1176,13 @@ xfs_iread_bmbt_block( { struct xfs_iread_state *ir = priv; struct xfs_mount *mp = cur->bc_mp; - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_btree_block *block; struct xfs_buf *bp; struct xfs_bmbt_rec *frp; xfs_extnum_t num_recs; xfs_extnum_t j; - int whichfork = cur->bc_private.b.whichfork; + int whichfork = cur->bc_ino.whichfork; block = xfs_btree_get_block(cur, level, &bp); @@ -1528,7 +1526,7 @@ xfs_bmap_add_extent_delay_real( ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || - (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -1818,7 +1816,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + (bma->cur ? bma->cur->bc_ino.allocated : 0)); PREV.br_startoff = new_endoff; PREV.br_blockcount = temp; @@ -1904,7 +1902,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + (bma->cur ? bma->cur->bc_ino.allocated : 0)); PREV.br_startblock = nullstartblock(da_new); PREV.br_blockcount = temp; @@ -2025,8 +2023,8 @@ xfs_bmap_add_extent_delay_real( xfs_mod_delalloc(mp, (int64_t)da_new - da_old); if (bma->cur) { - da_new += bma->cur->bc_private.b.allocated; - bma->cur->bc_private.b.allocated = 0; + da_new += bma->cur->bc_ino.allocated; + bma->cur->bc_ino.allocated = 0; } /* adjust for changes in reserved delayed indirect blocks */ @@ -2573,7 +2571,7 @@ xfs_bmap_add_extent_unwritten_real( /* clear out the allocated field, done with it now in any case. */ if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; *curp = cur; } @@ -2752,7 +2750,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2955,7 +2953,7 @@ xfs_bmap_add_extent_hole_real( /* clear out the allocated field, done with it now in any case. */ if (cur) - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: @@ -4187,8 +4185,8 @@ xfs_bmapi_allocate( bma->nallocs++; if (bma->cur) - bma->cur->bc_private.b.flags = - bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + bma->cur->bc_ino.flags = + bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; bma->got.br_startoff = bma->offset; bma->got.br_startblock = bma->blkno; @@ -4709,7 +4707,7 @@ xfs_bmapi_remap( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } got.br_startoff = bno; @@ -5364,7 +5362,7 @@ __xfs_bunmapi( if (ifp->if_flags & XFS_IFBROOT) { ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } else cur = NULL; @@ -5620,7 +5618,7 @@ error0: xfs_trans_log_inode(tp, ip, logflags); if (cur) { if (!error) - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -5839,7 +5837,7 @@ xfs_bmap_collapse_extents( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { @@ -5956,7 +5954,7 @@ xfs_bmap_insert_extents( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; } if (*next_fsb == NULLFSBLOCK) { @@ -6025,8 +6023,8 @@ del_cursor: * @split_fsb is a block where the extents is split. If split_fsb lies in a * hole or the first block of extents, just return 0. */ -STATIC int -xfs_bmap_split_extent_at( +int +xfs_bmap_split_extent( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_fsb) @@ -6074,7 +6072,7 @@ xfs_bmap_split_extent_at( if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_private.b.flags = 0; + cur->bc_ino.flags = 0; error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; @@ -6133,7 +6131,7 @@ xfs_bmap_split_extent_at( del_cursor: if (cur) { - cur->bc_private.b.allocated = 0; + cur->bc_ino.allocated = 0; xfs_btree_del_cursor(cur, error); } @@ -6142,34 +6140,6 @@ del_cursor: return error; } -int -xfs_bmap_split_extent( - struct xfs_inode *ip, - xfs_fileoff_t split_fsb) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - error = xfs_bmap_split_extent_at(tp, ip, split_fsb); - if (error) - goto out; - - return xfs_trans_commit(tp); - -out: - xfs_trans_cancel(tp); - return error; -} - /* Deferred mapping is only for real extents in the data fork. */ static bool xfs_bmap_is_update_needed( diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 14d25e0b7d9c..f3259ad5c22c 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -222,7 +222,8 @@ int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb); -int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); +int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index ffe608d2a2d9..295a59cf8840 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -166,13 +166,13 @@ xfs_bmbt_dup_cursor( struct xfs_btree_cur *new; new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.b.ip, cur->bc_private.b.whichfork); + cur->bc_ino.ip, cur->bc_ino.whichfork); /* * Copy the firstblock, dfops, and flags values, * since init cursor doesn't get them. */ - new->bc_private.b.flags = cur->bc_private.b.flags; + new->bc_ino.flags = cur->bc_ino.flags; return new; } @@ -183,12 +183,12 @@ xfs_bmbt_update_cursor( struct xfs_btree_cur *dst) { ASSERT((dst->bc_tp->t_firstblock != NULLFSBLOCK) || - (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); + (dst->bc_ino.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); - dst->bc_private.b.allocated += src->bc_private.b.allocated; + dst->bc_ino.allocated += src->bc_ino.allocated; dst->bc_tp->t_firstblock = src->bc_tp->t_firstblock; - src->bc_private.b.allocated = 0; + src->bc_ino.allocated = 0; } STATIC int @@ -205,8 +205,8 @@ xfs_bmbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.fsbno = cur->bc_tp->t_firstblock; - xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino, - cur->bc_private.b.whichfork); + xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino, + cur->bc_ino.whichfork); if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); @@ -230,7 +230,7 @@ xfs_bmbt_alloc_block( } args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL; if (!args.wasdel && args.tp->t_blk_res == 0) { error = -ENOSPC; goto error0; @@ -259,10 +259,10 @@ xfs_bmbt_alloc_block( ASSERT(args.len == 1); cur->bc_tp->t_firstblock = args.fsbno; - cur->bc_private.b.allocated++; - cur->bc_private.b.ip->i_d.di_nblocks++; - xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); - xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip, + cur->bc_ino.allocated++; + cur->bc_ino.ip->i_d.di_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip, XFS_TRANS_DQ_BCOUNT, 1L); new->l = cpu_to_be64(args.fsbno); @@ -280,12 +280,12 @@ xfs_bmbt_free_block( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); struct xfs_owner_info oinfo; - xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_private.b.whichfork); + xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo); ip->i_d.di_nblocks--; @@ -302,8 +302,8 @@ xfs_bmbt_get_minrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; @@ -320,8 +320,8 @@ xfs_bmbt_get_maxrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0); @@ -347,7 +347,7 @@ xfs_bmbt_get_dmaxrecs( { if (level != cur->bc_nlevels - 1) return cur->bc_mp->m_bmap_dmxr[level != 0]; - return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); + return xfs_bmdr_maxrecs(cur->bc_ino.forksize, level == 0); } STATIC void @@ -566,11 +566,11 @@ xfs_bmbt_init_cursor( if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); - cur->bc_private.b.ip = ip; - cur->bc_private.b.allocated = 0; - cur->bc_private.b.flags = 0; - cur->bc_private.b.whichfork = whichfork; + cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_ino.ip = ip; + cur->bc_ino.allocated = 0; + cur->bc_ino.flags = 0; + cur->bc_ino.whichfork = whichfork; return cur; } @@ -644,7 +644,7 @@ xfs_bmbt_change_owner( cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) return -ENOMEM; - cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; + cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index fd300dc93ca4..2d25bab68764 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -20,6 +20,7 @@ #include "xfs_trace.h" #include "xfs_alloc.h" #include "xfs_log.h" +#include "xfs_btree_staging.h" /* * Cursor allocation zone. @@ -214,7 +215,7 @@ xfs_btree_check_sptr( { if (level <= 0) return false; - return xfs_verify_agbno(cur->bc_mp, cur->bc_private.a.agno, agbno); + return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.agno, agbno); } /* @@ -234,8 +235,8 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.", - cur->bc_private.b.ip->i_ino, - cur->bc_private.b.whichfork, cur->bc_btnum, + cur->bc_ino.ip->i_ino, + cur->bc_ino.whichfork, cur->bc_btnum, level, index); } else { if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]), @@ -243,7 +244,7 @@ xfs_btree_check_ptr( return 0; xfs_err(cur->bc_mp, "AG %u: Corrupt btree %d pointer at level %d index %d.", - cur->bc_private.a.agno, cur->bc_btnum, + cur->bc_ag.agno, cur->bc_btnum, level, index); } @@ -378,10 +379,12 @@ xfs_btree_del_cursor( * allocated indirect blocks' accounting. */ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || - cur->bc_private.b.allocated == 0); + cur->bc_ino.allocated == 0); /* * Free the cursor. */ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) + kmem_free((void *)cur->bc_ops); kmem_cache_free(xfs_btree_cur_zone, cur); } @@ -642,6 +645,17 @@ xfs_btree_ptr_addr( ((char *)block + xfs_btree_ptr_offset(cur, n, level)); } +struct xfs_ifork * +xfs_btree_ifork_ptr( + struct xfs_btree_cur *cur) +{ + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + + if (cur->bc_flags & XFS_BTREE_STAGING) + return cur->bc_ino.ifake->if_fork; + return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork); +} + /* * Get the root block which is stored in the inode. * @@ -652,9 +666,8 @@ STATIC struct xfs_btree_block * xfs_btree_get_iroot( struct xfs_btree_cur *cur) { - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); return (struct xfs_btree_block *)ifp->if_broot; } @@ -881,13 +894,13 @@ xfs_btree_readahead_sblock( if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, left, 1, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.agno, right, 1, cur->bc_ops->buf_ops); rval++; } @@ -945,7 +958,7 @@ xfs_btree_ptr_to_daddr( *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); } else { agbno = be32_to_cpu(ptr->s); - *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.agno, agbno); } @@ -1014,7 +1027,7 @@ xfs_btree_ptr_is_null( return ptr->s == cpu_to_be32(NULLAGBLOCK); } -STATIC void +void xfs_btree_set_ptr_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) @@ -1050,7 +1063,7 @@ xfs_btree_get_sibling( } } -STATIC void +void xfs_btree_set_sibling( struct xfs_btree_cur *cur, struct xfs_btree_block *block, @@ -1128,7 +1141,7 @@ xfs_btree_init_block( btnum, level, numrecs, owner, 0); } -STATIC void +void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, struct xfs_buf *bp, @@ -1144,9 +1157,9 @@ xfs_btree_init_block_cur( * code. */ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - owner = cur->bc_private.b.ip->i_ino; + owner = cur->bc_ino.ip->i_ino; else - owner = cur->bc_private.a.agno; + owner = cur->bc_ag.agno; xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, cur->bc_btnum, level, numrecs, @@ -1220,7 +1233,7 @@ xfs_btree_set_refs( } } -STATIC int +int xfs_btree_get_buf_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, @@ -1280,7 +1293,7 @@ xfs_btree_read_buf_block( /* * Copy keys from one btree block to another. */ -STATIC void +void xfs_btree_copy_keys( struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, @@ -1308,11 +1321,11 @@ xfs_btree_copy_recs( /* * Copy block pointers from one btree block to another. */ -STATIC void +void xfs_btree_copy_ptrs( struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, - union xfs_btree_ptr *src_ptr, + const union xfs_btree_ptr *src_ptr, int numptrs) { ASSERT(numptrs >= 0); @@ -1393,8 +1406,8 @@ xfs_btree_log_keys( xfs_btree_key_offset(cur, first), xfs_btree_key_offset(cur, last + 1) - 1); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1436,8 +1449,8 @@ xfs_btree_log_ptrs( xfs_btree_ptr_offset(cur, first, level), xfs_btree_ptr_offset(cur, last + 1, level) - 1); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1505,8 +1518,8 @@ xfs_btree_log_block( xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, first, last); } else { - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip, + xfs_ilog_fbroot(cur->bc_ino.whichfork)); } } @@ -1743,10 +1756,10 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. */ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && - !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && + !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != - cur->bc_private.b.ip->i_ino) + cur->bc_ino.ip->i_ino) goto out_bad; /* Did we get the level we were looking for? */ @@ -1762,7 +1775,7 @@ xfs_btree_lookup_get_block( out_bad: *blkp = NULL; - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(cur->bc_tp, bp); return -EFSCORRUPTED; } @@ -2938,9 +2951,9 @@ xfs_btree_new_iroot( xfs_btree_copy_ptrs(cur, pp, &nptr, 1); - xfs_iroot_realloc(cur->bc_private.b.ip, + xfs_iroot_realloc(cur->bc_ino.ip, 1 - xfs_btree_get_numrecs(cblock), - cur->bc_private.b.whichfork); + cur->bc_ino.whichfork); xfs_btree_setbuf(cur, level, cbp); @@ -2953,7 +2966,7 @@ xfs_btree_new_iroot( xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); *logflags |= - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork); *stat = 1; return 0; error0: @@ -3105,11 +3118,11 @@ xfs_btree_make_block_unfull( if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) { - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_ino.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ - xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork); *stat = 1; } else { /* A root block that needs replacing */ @@ -3455,8 +3468,8 @@ STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { - int whichfork = cur->bc_private.b.whichfork; - struct xfs_inode *ip = cur->bc_private.b.ip; + int whichfork = cur->bc_ino.whichfork; + struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; @@ -3514,8 +3527,8 @@ xfs_btree_kill_iroot( index = numrecs - cur->bc_ops->get_maxrecs(cur, level); if (index) { - xfs_iroot_realloc(cur->bc_private.b.ip, index, - cur->bc_private.b.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, index, + cur->bc_ino.whichfork); block = ifp->if_broot; } @@ -3544,7 +3557,7 @@ xfs_btree_kill_iroot( cur->bc_bufs[level - 1] = NULL; be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, - XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); cur->bc_nlevels--; out0: return 0; @@ -3712,8 +3725,8 @@ xfs_btree_delrec( */ if (level == cur->bc_nlevels - 1) { if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { - xfs_iroot_realloc(cur->bc_private.b.ip, -1, - cur->bc_private.b.whichfork); + xfs_iroot_realloc(cur->bc_ino.ip, -1, + cur->bc_ino.whichfork); error = xfs_btree_kill_iroot(cur); if (error) diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3eff7c321d43..8626c5a81aad 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -10,6 +10,7 @@ struct xfs_buf; struct xfs_inode; struct xfs_mount; struct xfs_trans; +struct xfs_ifork; extern kmem_zone_t *xfs_btree_cur_zone; @@ -177,15 +178,37 @@ union xfs_btree_irec { struct xfs_refcount_irec rc; }; -/* Per-AG btree private information. */ -union xfs_btree_cur_private { - struct { - unsigned long nr_ops; /* # record updates */ - int shape_changes; /* # of extent splits */ - } refc; - struct { - bool active; /* allocation cursor state */ - } abt; +/* Per-AG btree information. */ +struct xfs_btree_cur_ag { + union { + struct xfs_buf *agbp; + struct xbtree_afakeroot *afake; /* for staging cursor */ + }; + xfs_agnumber_t agno; + union { + struct { + unsigned long nr_ops; /* # record updates */ + int shape_changes; /* # of extent splits */ + } refc; + struct { + bool active; /* allocation cursor state */ + } abt; + }; +}; + +/* Btree-in-inode cursor information */ +struct xfs_btree_cur_ino { + struct xfs_inode *ip; + struct xbtree_ifakeroot *ifake; /* for staging cursor */ + int allocated; + short forksize; + char whichfork; + char flags; +/* We are converting a delalloc reservation */ +#define XFS_BTCUR_BMBT_WASDEL (1 << 0) + +/* For extent swap, ignore owner check in verifier */ +#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1) }; /* @@ -209,21 +232,9 @@ typedef struct xfs_btree_cur xfs_btnum_t bc_btnum; /* identifies which btree type */ int bc_statoff; /* offset of btre stats array */ union { - struct { /* needed for BNO, CNT, INO */ - struct xfs_buf *agbp; /* agf/agi buffer pointer */ - xfs_agnumber_t agno; /* ag number */ - union xfs_btree_cur_private priv; - } a; - struct { /* needed for BMAP */ - struct xfs_inode *ip; /* pointer to our inode */ - int allocated; /* count of alloced */ - short forksize; /* fork's inode space */ - char whichfork; /* data or attr fork */ - char flags; /* flags */ -#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ -#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ - } b; - } bc_private; /* per-btree type data */ + struct xfs_btree_cur_ag bc_ag; + struct xfs_btree_cur_ino bc_ino; + }; } xfs_btree_cur_t; /* cursor flags */ @@ -232,6 +243,12 @@ typedef struct xfs_btree_cur #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ #define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ +/* + * The root of this btree is a fakeroot structure so that we can stage a btree + * rebuild without leaving it accessible via primary metadata. The ops struct + * is dynamically allocated and must be freed when the cursor is deleted. + */ +#define XFS_BTREE_STAGING (1<<5) #define XFS_BTREE_NOERROR 0 @@ -494,6 +511,7 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, union xfs_btree_irec *high, bool *exists); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); +struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); /* Does this cursor point to the last block in the given level? */ static inline bool @@ -512,4 +530,20 @@ xfs_btree_islastblock( return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } +void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); +int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, + struct xfs_btree_block **block, struct xfs_buf **bpp); +void xfs_btree_set_sibling(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, union xfs_btree_ptr *ptr, + int lr); +void xfs_btree_init_block_cur(struct xfs_btree_cur *cur, + struct xfs_buf *bp, int level, int numrecs); +void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, + union xfs_btree_ptr *dst_ptr, + const union xfs_btree_ptr *src_ptr, int numptrs); +void xfs_btree_copy_keys(struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, union xfs_btree_key *src_key, + int numkeys); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c new file mode 100644 index 000000000000..f464a7c7cf22 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -0,0 +1,879 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_trace.h" +#include "xfs_btree_staging.h" + +/* + * Staging Cursors and Fake Roots for Btrees + * ========================================= + * + * A staging btree cursor is a special type of btree cursor that callers must + * use to construct a new btree index using the btree bulk loader code. The + * bulk loading code uses the staging btree cursor to abstract the details of + * initializing new btree blocks and filling them with records or key/ptr + * pairs. Regular btree operations (e.g. queries and modifications) are not + * supported with staging cursors, and callers must not invoke them. + * + * Fake root structures contain all the information about a btree that is under + * construction by the bulk loading code. Staging btree cursors point to fake + * root structures instead of the usual AG header or inode structure. + * + * Callers are expected to initialize a fake root structure and pass it into + * the _stage_cursor function for a specific btree type. When bulk loading is + * complete, callers should call the _commit_staged_btree function for that + * specific btree type to commit the new btree into the filesystem. + */ + +/* + * Don't allow staging cursors to be duplicated because they're supposed to be + * kept private to a single thread. + */ +STATIC struct xfs_btree_cur * +xfs_btree_fakeroot_dup_cursor( + struct xfs_btree_cur *cur) +{ + ASSERT(0); + return NULL; +} + +/* + * Don't allow block allocation for a staging cursor, because staging cursors + * do not support regular btree modifications. + * + * Bulk loading uses a separate callback to obtain new blocks from a + * preallocated list, which prevents ENOSPC failures during loading. + */ +STATIC int +xfs_btree_fakeroot_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int *stat) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Don't allow block freeing for a staging cursor, because staging cursors + * do not support regular btree modifications. + */ +STATIC int +xfs_btree_fakeroot_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +/* Initialize a pointer to the root block from the fakeroot. */ +STATIC void +xfs_btree_fakeroot_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xbtree_afakeroot *afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + afake = cur->bc_ag.afake; + ptr->s = cpu_to_be32(afake->af_root); +} + +/* + * Bulk Loading for AG Btrees + * ========================== + * + * For a btree rooted in an AG header, pass a xbtree_afakeroot structure to the + * staging cursor. Callers should initialize this to zero. + * + * The _stage_cursor() function for a specific btree type should call + * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging + * cursor. The corresponding _commit_staged_btree() function should log the + * new root and call xfs_btree_commit_afakeroot() to transform the staging + * cursor into a regular btree cursor. + */ + +/* Update the btree root information for a per-AG fake root. */ +STATIC void +xfs_btree_afakeroot_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + afake->af_root = be32_to_cpu(ptr->s); + afake->af_levels += inc; +} + +/* + * Initialize a AG-rooted btree cursor with the given AG btree fake root. + * The btree cursor's bc_ops will be overridden as needed to make the staging + * functionality work. + */ +void +xfs_btree_stage_afakeroot( + struct xfs_btree_cur *cur, + struct xbtree_afakeroot *afake) +{ + struct xfs_btree_ops *nops; + + ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); + ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); + ASSERT(cur->bc_tp == NULL); + + nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); + nops->alloc_block = xfs_btree_fakeroot_alloc_block; + nops->free_block = xfs_btree_fakeroot_free_block; + nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; + nops->set_root = xfs_btree_afakeroot_set_root; + nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; + + cur->bc_ag.afake = afake; + cur->bc_nlevels = afake->af_levels; + cur->bc_ops = nops; + cur->bc_flags |= XFS_BTREE_STAGING; +} + +/* + * Transform an AG-rooted staging btree cursor back into a regular cursor by + * substituting a real btree root for the fake one and restoring normal btree + * cursor ops. The caller must log the btree root change prior to calling + * this. + */ +void +xfs_btree_commit_afakeroot( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp, + const struct xfs_btree_ops *ops) +{ + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(cur->bc_tp == NULL); + + trace_xfs_btree_commit_afakeroot(cur); + + kmem_free((void *)cur->bc_ops); + cur->bc_ag.agbp = agbp; + cur->bc_ops = ops; + cur->bc_flags &= ~XFS_BTREE_STAGING; + cur->bc_tp = tp; +} + +/* + * Bulk Loading for Inode-Rooted Btrees + * ==================================== + * + * For a btree rooted in an inode fork, pass a xbtree_ifakeroot structure to + * the staging cursor. This structure should be initialized as follows: + * + * - if_fork_size field should be set to the number of bytes available to the + * fork in the inode. + * + * - if_fork should point to a freshly allocated struct xfs_ifork. + * + * - if_format should be set to the appropriate fork type (e.g. + * XFS_DINODE_FMT_BTREE). + * + * All other fields must be zero. + * + * The _stage_cursor() function for a specific btree type should call + * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging + * cursor. The corresponding _commit_staged_btree() function should log the + * new root and call xfs_btree_commit_ifakeroot() to transform the staging + * cursor into a regular btree cursor. + */ + +/* + * Initialize an inode-rooted btree cursor with the given inode btree fake + * root. The btree cursor's bc_ops will be overridden as needed to make the + * staging functionality work. If new_ops is not NULL, these new ops will be + * passed out to the caller for further overriding. + */ +void +xfs_btree_stage_ifakeroot( + struct xfs_btree_cur *cur, + struct xbtree_ifakeroot *ifake, + struct xfs_btree_ops **new_ops) +{ + struct xfs_btree_ops *nops; + + ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_tp == NULL); + + nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); + nops->alloc_block = xfs_btree_fakeroot_alloc_block; + nops->free_block = xfs_btree_fakeroot_free_block; + nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; + nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; + + cur->bc_ino.ifake = ifake; + cur->bc_nlevels = ifake->if_levels; + cur->bc_ops = nops; + cur->bc_flags |= XFS_BTREE_STAGING; + + if (new_ops) + *new_ops = nops; +} + +/* + * Transform an inode-rooted staging btree cursor back into a regular cursor by + * substituting a real btree root for the fake one and restoring normal btree + * cursor ops. The caller must log the btree root change prior to calling + * this. + */ +void +xfs_btree_commit_ifakeroot( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + int whichfork, + const struct xfs_btree_ops *ops) +{ + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(cur->bc_tp == NULL); + + trace_xfs_btree_commit_ifakeroot(cur); + + kmem_free((void *)cur->bc_ops); + cur->bc_ino.ifake = NULL; + cur->bc_ino.whichfork = whichfork; + cur->bc_ops = ops; + cur->bc_flags &= ~XFS_BTREE_STAGING; + cur->bc_tp = tp; +} + +/* + * Bulk Loading of Staged Btrees + * ============================= + * + * This interface is used with a staged btree cursor to create a totally new + * btree with a large number of records (i.e. more than what would fit in a + * single root block). When the creation is complete, the new root can be + * linked atomically into the filesystem by committing the staged cursor. + * + * Creation of a new btree proceeds roughly as follows: + * + * The first step is to initialize an appropriate fake btree root structure and + * then construct a staged btree cursor. Refer to the block comments about + * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for + * more information about how to do this. + * + * The second step is to initialize a struct xfs_btree_bload context as + * documented in the structure definition. + * + * The third step is to call xfs_btree_bload_compute_geometry to compute the + * height of and the number of blocks needed to construct the btree. See the + * section "Computing the Geometry of the New Btree" for details about this + * computation. + * + * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and + * save them for later use by ->claim_block(). Bulk loading requires all + * blocks to be allocated beforehand to avoid ENOSPC failures midway through a + * rebuild, and to minimize seek distances of the new btree. + * + * Step five is to call xfs_btree_bload() to start constructing the btree. + * + * The final step is to commit the staging btree cursor, which logs the new + * btree root and turns the staging cursor into a regular cursor. The caller + * is responsible for cleaning up the previous btree blocks, if any. + * + * Computing the Geometry of the New Btree + * ======================================= + * + * The number of items placed in each btree block is computed via the following + * algorithm: For leaf levels, the number of items for the level is nr_records + * in the bload structure. For node levels, the number of items for the level + * is the number of blocks in the next lower level of the tree. For each + * level, the desired number of items per block is defined as: + * + * desired = max(minrecs, maxrecs - slack factor) + * + * The number of blocks for the level is defined to be: + * + * blocks = floor(nr_items / desired) + * + * Note this is rounded down so that the npb calculation below will never fall + * below minrecs. The number of items that will actually be loaded into each + * btree block is defined as: + * + * npb = nr_items / blocks + * + * Some of the leftmost blocks in the level will contain one extra record as + * needed to handle uneven division. If the number of records in any block + * would exceed maxrecs for that level, blocks is incremented and npb is + * recalculated. + * + * In other words, we compute the number of blocks needed to satisfy a given + * loading level, then spread the items as evenly as possible. + * + * The height and number of fs blocks required to create the btree are computed + * and returned via btree_height and nr_blocks. + */ + +/* + * Put a btree block that we're loading onto the ordered list and release it. + * The btree blocks will be written to disk when bulk loading is finished. + */ +static void +xfs_btree_bload_drop_buf( + struct list_head *buffers_list, + struct xfs_buf **bpp) +{ + if (*bpp == NULL) + return; + + if (!xfs_buf_delwri_queue(*bpp, buffers_list)) + ASSERT(0); + + xfs_buf_relse(*bpp); + *bpp = NULL; +} + +/* + * Allocate and initialize one btree block for bulk loading. + * + * The new btree block will have its level and numrecs fields set to the values + * of the level and nr_this_block parameters, respectively. + * + * The caller should ensure that ptrp, bpp, and blockp refer to the left + * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp + * will all point to the new block. + */ +STATIC int +xfs_btree_bload_prep_block( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + struct list_head *buffers_list, + unsigned int level, + unsigned int nr_this_block, + union xfs_btree_ptr *ptrp, /* in/out */ + struct xfs_buf **bpp, /* in/out */ + struct xfs_btree_block **blockp, /* in/out */ + void *priv) +{ + union xfs_btree_ptr new_ptr; + struct xfs_buf *new_bp; + struct xfs_btree_block *new_block; + int ret; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); + size_t new_size; + + ASSERT(*bpp == NULL); + + /* Allocate a new incore btree root block. */ + new_size = bbl->iroot_size(cur, nr_this_block, priv); + ifp->if_broot = kmem_zalloc(new_size, 0); + ifp->if_broot_bytes = (int)new_size; + ifp->if_flags |= XFS_IFBROOT; + + /* Initialize it and send it out. */ + xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot, + XFS_BUF_DADDR_NULL, cur->bc_btnum, level, + nr_this_block, cur->bc_ino.ip->i_ino, + cur->bc_flags); + + *bpp = NULL; + *blockp = ifp->if_broot; + xfs_btree_set_ptr_null(cur, ptrp); + return 0; + } + + /* Claim one of the caller's preallocated blocks. */ + xfs_btree_set_ptr_null(cur, &new_ptr); + ret = bbl->claim_block(cur, &new_ptr, priv); + if (ret) + return ret; + + ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr)); + + ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp); + if (ret) + return ret; + + /* + * The previous block (if any) is the left sibling of the new block, + * so set its right sibling pointer to the new block and drop it. + */ + if (*blockp) + xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB); + xfs_btree_bload_drop_buf(buffers_list, bpp); + + /* Initialize the new btree block. */ + xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block); + xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB); + + /* Set the out parameters. */ + *bpp = new_bp; + *blockp = new_block; + xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1); + return 0; +} + +/* Load one leaf block. */ +STATIC int +xfs_btree_bload_leaf( + struct xfs_btree_cur *cur, + unsigned int recs_this_block, + xfs_btree_bload_get_record_fn get_record, + struct xfs_btree_block *block, + void *priv) +{ + unsigned int j; + int ret; + + /* Fill the leaf block with records. */ + for (j = 1; j <= recs_this_block; j++) { + union xfs_btree_rec *block_rec; + + ret = get_record(cur, priv); + if (ret) + return ret; + block_rec = xfs_btree_rec_addr(cur, j, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return 0; +} + +/* + * Load one node block with key/ptr pairs. + * + * child_ptr must point to a block within the next level down in the tree. A + * key/ptr entry will be created in the new node block to the block pointed to + * by child_ptr. On exit, child_ptr points to the next block on the child + * level that needs processing. + */ +STATIC int +xfs_btree_bload_node( + struct xfs_btree_cur *cur, + unsigned int recs_this_block, + union xfs_btree_ptr *child_ptr, + struct xfs_btree_block *block) +{ + unsigned int j; + int ret; + + /* Fill the node block with keys and pointers. */ + for (j = 1; j <= recs_this_block; j++) { + union xfs_btree_key child_key; + union xfs_btree_ptr *block_ptr; + union xfs_btree_key *block_key; + struct xfs_btree_block *child_block; + struct xfs_buf *child_bp; + + ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); + + ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block, + &child_bp); + if (ret) + return ret; + + block_ptr = xfs_btree_ptr_addr(cur, j, block); + xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1); + + block_key = xfs_btree_key_addr(cur, j, block); + xfs_btree_get_keys(cur, child_block, &child_key); + xfs_btree_copy_keys(cur, block_key, &child_key, 1); + + xfs_btree_get_sibling(cur, child_block, child_ptr, + XFS_BB_RIGHTSIB); + xfs_buf_relse(child_bp); + } + + return 0; +} + +/* + * Compute the maximum number of records (or keyptrs) per block that we want to + * install at this level in the btree. Caller is responsible for having set + * @cur->bc_ino.forksize to the desired fork size, if appropriate. + */ +STATIC unsigned int +xfs_btree_bload_max_npb( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level) +{ + unsigned int ret; + + if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs) + return cur->bc_ops->get_dmaxrecs(cur, level); + + ret = cur->bc_ops->get_maxrecs(cur, level); + if (level == 0) + ret -= bbl->leaf_slack; + else + ret -= bbl->node_slack; + return ret; +} + +/* + * Compute the desired number of records (or keyptrs) per block that we want to + * install at this level in the btree, which must be somewhere between minrecs + * and max_npb. The caller is free to install fewer records per block. + */ +STATIC unsigned int +xfs_btree_bload_desired_npb( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level) +{ + unsigned int npb = xfs_btree_bload_max_npb(cur, bbl, level); + + /* Root blocks are not subject to minrecs rules. */ + if (level == cur->bc_nlevels - 1) + return max(1U, npb); + + return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb); +} + +/* + * Compute the number of records to be stored in each block at this level and + * the number of blocks for this level. For leaf levels, we must populate an + * empty root block even if there are no records, so we have to have at least + * one block. + */ +STATIC void +xfs_btree_bload_level_geometry( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + unsigned int level, + uint64_t nr_this_level, + unsigned int *avg_per_block, + uint64_t *blocks, + uint64_t *blocks_with_extra) +{ + uint64_t npb; + uint64_t dontcare; + unsigned int desired_npb; + unsigned int maxnr; + + maxnr = cur->bc_ops->get_maxrecs(cur, level); + + /* + * Compute the number of blocks we need to fill each block with the + * desired number of records/keyptrs per block. Because desired_npb + * could be minrecs, we use regular integer division (which rounds + * the block count down) so that in the next step the effective # of + * items per block will never be less than desired_npb. + */ + desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level); + *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare); + *blocks = max(1ULL, *blocks); + + /* + * Compute the number of records that we will actually put in each + * block, assuming that we want to spread the records evenly between + * the blocks. Take care that the effective # of items per block (npb) + * won't exceed maxrecs even for the blocks that get an extra record, + * since desired_npb could be maxrecs, and in the previous step we + * rounded the block count down. + */ + npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); + if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) { + (*blocks)++; + npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra); + } + + *avg_per_block = min_t(uint64_t, npb, nr_this_level); + + trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level, + *avg_per_block, desired_npb, *blocks, + *blocks_with_extra); +} + +/* + * Ensure a slack value is appropriate for the btree. + * + * If the slack value is negative, set slack so that we fill the block to + * halfway between minrecs and maxrecs. Make sure the slack is never so large + * that we can underflow minrecs. + */ +static void +xfs_btree_bload_ensure_slack( + struct xfs_btree_cur *cur, + int *slack, + int level) +{ + int maxr; + int minr; + + maxr = cur->bc_ops->get_maxrecs(cur, level); + minr = cur->bc_ops->get_minrecs(cur, level); + + /* + * If slack is negative, automatically set slack so that we load the + * btree block approximately halfway between minrecs and maxrecs. + * Generally, this will net us 75% loading. + */ + if (*slack < 0) + *slack = maxr - ((maxr + minr) >> 1); + + *slack = min(*slack, maxr - minr); +} + +/* + * Prepare a btree cursor for a bulk load operation by computing the geometry + * fields in bbl. Caller must ensure that the btree cursor is a staging + * cursor. This function can be called multiple times. + */ +int +xfs_btree_bload_compute_geometry( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + uint64_t nr_records) +{ + uint64_t nr_blocks = 0; + uint64_t nr_this_level; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + /* + * Make sure that the slack values make sense for traditional leaf and + * node blocks. Inode-rooted btrees will return different minrecs and + * maxrecs values for the root block (bc_nlevels == level - 1). We're + * checking levels 0 and 1 here, so set bc_nlevels such that the btree + * code doesn't interpret either as the root level. + */ + cur->bc_nlevels = XFS_BTREE_MAXLEVELS - 1; + xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0); + xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1); + + bbl->nr_records = nr_this_level = nr_records; + for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) { + uint64_t level_blocks; + uint64_t dontcare64; + unsigned int level = cur->bc_nlevels - 1; + unsigned int avg_per_block; + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &level_blocks, &dontcare64); + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + /* + * If all the items we want to store at this level + * would fit in the inode root block, then we have our + * btree root and are done. + * + * Note that bmap btrees forbid records in the root. + */ + if (level != 0 && nr_this_level <= avg_per_block) { + nr_blocks++; + break; + } + + /* + * Otherwise, we have to store all the items for this + * level in traditional btree blocks and therefore need + * another level of btree to point to those blocks. + * + * We have to re-compute the geometry for each level of + * an inode-rooted btree because the geometry differs + * between a btree root in an inode fork and a + * traditional btree block. + * + * This distinction is made in the btree code based on + * whether level == bc_nlevels - 1. Based on the + * previous root block size check against the root + * block geometry, we know that we aren't yet ready to + * populate the root. Increment bc_nevels and + * recalculate the geometry for a traditional + * block-based btree level. + */ + cur->bc_nlevels++; + xfs_btree_bload_level_geometry(cur, bbl, level, + nr_this_level, &avg_per_block, + &level_blocks, &dontcare64); + } else { + /* + * If all the items we want to store at this level + * would fit in a single root block, we're done. + */ + if (nr_this_level <= avg_per_block) { + nr_blocks++; + break; + } + + /* Otherwise, we need another level of btree. */ + cur->bc_nlevels++; + } + + nr_blocks += level_blocks; + nr_this_level = level_blocks; + } + + if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS) + return -EOVERFLOW; + + bbl->btree_height = cur->bc_nlevels; + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + bbl->nr_blocks = nr_blocks - 1; + else + bbl->nr_blocks = nr_blocks; + return 0; +} + +/* Bulk load a btree given the parameters and geometry established in bbl. */ +int +xfs_btree_bload( + struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, + void *priv) +{ + struct list_head buffers_list; + union xfs_btree_ptr child_ptr; + union xfs_btree_ptr ptr; + struct xfs_buf *bp = NULL; + struct xfs_btree_block *block = NULL; + uint64_t nr_this_level = bbl->nr_records; + uint64_t blocks; + uint64_t i; + uint64_t blocks_with_extra; + uint64_t total_blocks = 0; + unsigned int avg_per_block; + unsigned int level = 0; + int ret; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + INIT_LIST_HEAD(&buffers_list); + cur->bc_nlevels = bbl->btree_height; + xfs_btree_set_ptr_null(cur, &child_ptr); + xfs_btree_set_ptr_null(cur, &ptr); + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &blocks, &blocks_with_extra); + + /* Load each leaf block. */ + for (i = 0; i < blocks; i++) { + unsigned int nr_this_block = avg_per_block; + + /* + * Due to rounding, btree blocks will not be evenly populated + * in most cases. blocks_with_extra tells us how many blocks + * will receive an extra record to distribute the excess across + * the current level as evenly as possible. + */ + if (i < blocks_with_extra) + nr_this_block++; + + ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level, + nr_this_block, &ptr, &bp, &block, priv); + if (ret) + goto out; + + trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, + nr_this_block); + + ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record, + block, priv); + if (ret) + goto out; + + /* + * Record the leftmost leaf pointer so we know where to start + * with the first node level. + */ + if (i == 0) + xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); + } + total_blocks += blocks; + xfs_btree_bload_drop_buf(&buffers_list, &bp); + + /* Populate the internal btree nodes. */ + for (level = 1; level < cur->bc_nlevels; level++) { + union xfs_btree_ptr first_ptr; + + nr_this_level = blocks; + block = NULL; + xfs_btree_set_ptr_null(cur, &ptr); + + xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, + &avg_per_block, &blocks, &blocks_with_extra); + + /* Load each node block. */ + for (i = 0; i < blocks; i++) { + unsigned int nr_this_block = avg_per_block; + + if (i < blocks_with_extra) + nr_this_block++; + + ret = xfs_btree_bload_prep_block(cur, bbl, + &buffers_list, level, nr_this_block, + &ptr, &bp, &block, priv); + if (ret) + goto out; + + trace_xfs_btree_bload_block(cur, level, i, blocks, + &ptr, nr_this_block); + + ret = xfs_btree_bload_node(cur, nr_this_block, + &child_ptr, block); + if (ret) + goto out; + + /* + * Record the leftmost node pointer so that we know + * where to start the next node level above this one. + */ + if (i == 0) + xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); + } + total_blocks += blocks; + xfs_btree_bload_drop_buf(&buffers_list, &bp); + xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1); + } + + /* Initialize the new root. */ + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); + cur->bc_ino.ifake->if_levels = cur->bc_nlevels; + cur->bc_ino.ifake->if_blocks = total_blocks - 1; + } else { + cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s); + cur->bc_ag.afake->af_levels = cur->bc_nlevels; + cur->bc_ag.afake->af_blocks = total_blocks; + } + + /* + * Write the new blocks to disk. If the ordered list isn't empty after + * that, then something went wrong and we have to fail. This should + * never happen, but we'll check anyway. + */ + ret = xfs_buf_delwri_submit(&buffers_list); + if (ret) + goto out; + if (!list_empty(&buffers_list)) { + ASSERT(list_empty(&buffers_list)); + ret = -EIO; + } + +out: + xfs_buf_delwri_cancel(&buffers_list); + if (bp) + xfs_buf_relse(bp); + return ret; +} diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h new file mode 100644 index 000000000000..643f0f9b2994 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_BTREE_STAGING_H__ +#define __XFS_BTREE_STAGING_H__ + +/* Fake root for an AG-rooted btree. */ +struct xbtree_afakeroot { + /* AG block number of the new btree root. */ + xfs_agblock_t af_root; + + /* Height of the new btree. */ + unsigned int af_levels; + + /* Number of blocks used by the btree. */ + unsigned int af_blocks; +}; + +/* Cursor interactions with with fake roots for AG-rooted btrees. */ +void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur, + struct xbtree_afakeroot *afake); +void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, + struct xfs_buf *agbp, const struct xfs_btree_ops *ops); + +/* Fake root for an inode-rooted btree. */ +struct xbtree_ifakeroot { + /* Fake inode fork. */ + struct xfs_ifork *if_fork; + + /* Number of blocks used by the btree. */ + int64_t if_blocks; + + /* Height of the new btree. */ + unsigned int if_levels; + + /* Number of bytes available for this fork in the inode. */ + unsigned int if_fork_size; + + /* Fork format. */ + unsigned int if_format; + + /* Number of records. */ + unsigned int if_extents; +}; + +/* Cursor interactions with with fake roots for inode-rooted btrees. */ +void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur, + struct xbtree_ifakeroot *ifake, + struct xfs_btree_ops **new_ops); +void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, + int whichfork, const struct xfs_btree_ops *ops); + +/* Bulk loading of staged btrees. */ +typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv); +typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, void *priv); +typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, + unsigned int nr_this_level, void *priv); + +struct xfs_btree_bload { + /* + * This function will be called nr_records times to load records into + * the btree. The function does this by setting the cursor's bc_rec + * field in in-core format. Records must be returned in sort order. + */ + xfs_btree_bload_get_record_fn get_record; + + /* + * This function will be called nr_blocks times to obtain a pointer + * to a new btree block on disk. Callers must preallocate all space + * for the new btree before calling xfs_btree_bload, and this function + * is what claims that reservation. + */ + xfs_btree_bload_claim_block_fn claim_block; + + /* + * This function should return the size of the in-core btree root + * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree + * types. + */ + xfs_btree_bload_iroot_size_fn iroot_size; + + /* + * The caller should set this to the number of records that will be + * stored in the new btree. + */ + uint64_t nr_records; + + /* + * Number of free records to leave in each leaf block. If the caller + * sets this to -1, the slack value will be calculated to be be halfway + * between maxrecs and minrecs. This typically leaves the block 75% + * full. Note that slack values are not enforced on inode root blocks. + */ + int leaf_slack; + + /* + * Number of free key/ptrs pairs to leave in each node block. This + * field has the same semantics as leaf_slack. + */ + int node_slack; + + /* + * The xfs_btree_bload_compute_geometry function will set this to the + * number of btree blocks needed to store nr_records records. + */ + uint64_t nr_blocks; + + /* + * The xfs_btree_bload_compute_geometry function will set this to the + * height of the new btree. + */ + unsigned int btree_height; +}; + +int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur, + struct xfs_btree_bload *bbl, uint64_t nr_records); +int xfs_btree_bload(struct xfs_btree_cur *cur, struct xfs_btree_bload *bbl, + void *priv); + +#endif /* __XFS_BTREE_STAGING_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 875e04f82541..897749c41f36 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -590,7 +590,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) { - xfs_buf_corruption_error(oldblk->bp); + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -603,7 +603,7 @@ xfs_da3_split( node = oldblk->bp->b_addr; if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) { - xfs_buf_corruption_error(oldblk->bp); + xfs_buf_mark_corrupt(oldblk->bp); error = -EFSCORRUPTED; goto out; } @@ -1624,7 +1624,7 @@ xfs_da3_node_lookup_int( } if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } @@ -1639,7 +1639,7 @@ xfs_da3_node_lookup_int( /* Tree taller than we can handle; bail out! */ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } @@ -1647,7 +1647,7 @@ xfs_da3_node_lookup_int( if (blkno == args->geo->leafblk) expected_level = nodehdr.level - 1; else if (expected_level != nodehdr.level) { - xfs_buf_corruption_error(blk->bp); + xfs_buf_mark_corrupt(blk->bp); return -EFSCORRUPTED; } else expected_level--; @@ -1986,7 +1986,8 @@ xfs_da3_path_shift( ASSERT(path != NULL); ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ - for (blk = &path->blk[level]; level >= 0; blk--, level--) { + for (; level >= 0; level--) { + blk = &path->blk[level]; xfs_da3_node_hdr_from_disk(dp->i_mount, &nodehdr, blk->bp->b_addr); @@ -2520,8 +2521,10 @@ xfs_dabuf_map( */ if (nirecs > 1) { map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS); - if (!map) + if (!map) { + error = -ENOMEM; goto out_free_irecs; + } *mapp = map; } diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index 0f4fbb0889ff..53e503b6f186 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -57,9 +57,10 @@ typedef struct xfs_da_args { const uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ uint8_t filetype; /* filetype of inode for directories */ - uint8_t *value; /* set of bytes (maybe contain NULLs) */ + void *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ - int flags; /* argument flags (eg: ATTR_NOCREATE) */ + unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */ + unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */ xfs_dahash_t hashval; /* hash value of name */ xfs_ino_t inumber; /* input/output inode number */ struct xfs_inode *dp; /* directory inode to manipulate */ @@ -88,8 +89,7 @@ typedef struct xfs_da_args { #define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ #define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ #define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ -#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */ -#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */ +#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */ #define XFS_DA_OP_FLAGS \ { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ @@ -97,8 +97,7 @@ typedef struct xfs_da_args { { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ { XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \ - { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \ - { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" } + { XFS_DA_OP_NOTIME, "NOTIME" } /* * Storage for holding state during Btree searches and split/join ops. diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 734837a9b51a..08c0a4d98b89 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -692,19 +692,7 @@ struct xfs_attr3_leafblock { #define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) #define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) #define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) - -/* - * Conversion macros for converting namespace bits from argument flags - * to ondisk flags. - */ -#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) -#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) -#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) -#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ - ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) -#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ - ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) /* * Alignment for namelist and valuelist entries (since they are mixed diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index d6ced59b9567..1dbf2f980a26 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -114,6 +114,23 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .verify_struct = xfs_dir3_block_verify, }; +static xfs_failaddr_t +xfs_dir3_block_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} + int xfs_dir3_block_read( struct xfs_trans *tp, @@ -121,12 +138,24 @@ xfs_dir3_block_read( struct xfs_buf **bpp) { struct xfs_mount *mp = dp->i_mount; + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, 0, bpp, XFS_DATA_FORK, &xfs_dir3_block_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + fa = xfs_dir3_block_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index b9eba8213180..375b3edb2ad2 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -394,6 +394,22 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .verify_write = xfs_dir3_data_write_verify, }; +static xfs_failaddr_t +xfs_dir3_data_header_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = dp->i_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; + + if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + return __this_address; + } + + return NULL; +} int xfs_dir3_data_read( @@ -403,12 +419,24 @@ xfs_dir3_data_read( unsigned int flags, struct xfs_buf **bpp) { + xfs_failaddr_t fa; int err; err = xfs_da_read_buf(tp, dp, bno, flags, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp && *bpp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + if (err || !*bpp) + return err; + + /* Check things that we can't do in the verifier. */ + fa = xfs_dir3_data_header_check(dp, *bpp); + if (fa) { + __xfs_buf_mark_corrupt(*bpp, fa); + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + return -EFSCORRUPTED; + } + + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index a131b520aac7..95d2a3f92d75 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -1383,7 +1383,7 @@ xfs_dir2_leaf_removename( ltp = xfs_dir2_leaf_tail_p(geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[db]) != oldbest) { - xfs_buf_corruption_error(lbp); + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; } /* diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index a0cc5e240306..6ac4aad98cd7 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -194,6 +194,8 @@ xfs_dir3_free_header_check( return __this_address; if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused)) return __this_address; + if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) + return __this_address; } else { struct xfs_dir2_free_hdr *hdr = bp->b_addr; @@ -226,8 +228,9 @@ __xfs_dir3_free_read( /* Check things that we can't do in the verifier. */ fa = xfs_dir3_free_header_check(dp, fbno, *bpp); if (fa) { - xfs_verifier_error(*bpp, -EFSCORRUPTED, fa); + __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); + *bpp = NULL; return -EFSCORRUPTED; } @@ -439,7 +442,7 @@ xfs_dir2_leaf_to_node( ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); if (be32_to_cpu(ltp->bestcount) > (uint)dp->i_d.di_size / args->geo->blksize) { - xfs_buf_corruption_error(lbp); + xfs_buf_mark_corrupt(lbp); return -EFSCORRUPTED; } @@ -513,7 +516,7 @@ xfs_dir2_leafn_add( * into other peoples memory */ if (index < 0) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } @@ -800,7 +803,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir3_leaf_check(dp, bp); if (leafhdr.count <= 0) { - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 77e9fa385980..045556e78ee2 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -497,6 +497,23 @@ static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp) return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } +/* + * v5 file systems support V3 inodes only, earlier file systems support + * v2 and v1 inodes. + */ +static inline bool xfs_sb_version_has_v3inode(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline bool xfs_dinode_good_version(struct xfs_sb *sbp, + uint8_t version) +{ + if (xfs_sb_version_has_v3inode(sbp)) + return version == 3; + return version == 1 || version == 2; +} + static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp) { return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; @@ -560,7 +577,6 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ @@ -707,7 +723,6 @@ typedef struct xfs_agf { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) -#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) /* * Size of the unlinked inode hash table in the agi. @@ -775,7 +790,6 @@ typedef struct xfs_agi { /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) -#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) /* * The third a.g. block contains the a.g. freelist, an array @@ -783,21 +797,15 @@ typedef struct xfs_agi { */ #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) +#define XFS_BUF_TO_AGFL(bp) ((struct xfs_agfl *)((bp)->b_addr)) -#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ - &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ - (__be32 *)(bp)->b_addr) - -typedef struct xfs_agfl { +struct xfs_agfl { __be32 agfl_magicnum; __be32 agfl_seqno; uuid_t agfl_uuid; __be64 agfl_lsn; __be32 agfl_crc; - __be32 agfl_bno[]; /* actually xfs_agfl_size(mp) */ -} __attribute__((packed)) xfs_agfl_t; +} __attribute__((packed)); #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) @@ -946,8 +954,12 @@ enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_LITINO(mp, version) \ - ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) +#define XFS_DINODE_SIZE(sbp) \ + (xfs_sb_version_has_v3inode(sbp) ? \ + sizeof(struct xfs_dinode) : \ + offsetof(struct xfs_dinode, di_crc)) +#define XFS_LITINO(mp) \ + ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(&(mp)->m_sb)) /* * Inode data & attribute fork sizes, per inode. @@ -956,13 +968,9 @@ enum xfs_dinode_fmt { #define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp, (dip)->di_version)) + (XFS_DFORK_Q(dip) ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp)) #define XFS_DFORK_ASIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ - 0) + (XFS_DFORK_Q(dip) ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0) #define XFS_DFORK_SIZE(dip,mp,w) \ ((w) == XFS_DATA_FORK ? \ XFS_DFORK_DSIZE(dip, mp) : \ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index ef95ca07d084..245188e4f6d3 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -568,10 +568,40 @@ typedef struct xfs_fsop_setdm_handlereq { struct fsdmidata __user *data; /* DMAPI data */ } xfs_fsop_setdm_handlereq_t; +/* + * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface. + * + * NOTE: Must match the values declared in libattr without the XFS_IOC_ prefix. + */ +#define XFS_IOC_ATTR_ROOT 0x0002 /* use attrs in root namespace */ +#define XFS_IOC_ATTR_SECURE 0x0008 /* use attrs in security namespace */ +#define XFS_IOC_ATTR_CREATE 0x0010 /* fail if attr already exists */ +#define XFS_IOC_ATTR_REPLACE 0x0020 /* fail if attr does not exist */ + typedef struct xfs_attrlist_cursor { __u32 opaque[4]; } xfs_attrlist_cursor_t; +/* + * Define how lists of attribute names are returned to userspace from the + * XFS_IOC_ATTRLIST_BY_HANDLE ioctl. struct xfs_attrlist is the header at the + * beginning of the returned buffer, and a each entry in al_offset contains the + * relative offset of an xfs_attrlist_ent containing the actual entry. + * + * NOTE: struct xfs_attrlist must match struct attrlist defined in libattr, and + * struct xfs_attrlist_ent must match struct attrlist_ent defined in libattr. + */ +struct xfs_attrlist { + __s32 al_count; /* number of entries in attrlist */ + __s32 al_more; /* T/F: more attrs (do call again) */ + __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ +}; + +struct xfs_attrlist_ent { /* data from attr_list() */ + __u32 a_valuelen; /* number bytes in value of attr */ + char a_name[1]; /* attr name (NULL terminated) */ +}; + typedef struct xfs_fsop_attrlist_handlereq { struct xfs_fsop_handlereq hreq; /* handle interface structure */ struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ @@ -589,7 +619,7 @@ typedef struct xfs_attr_multiop { void __user *am_attrname; void __user *am_attrvalue; __u32 am_length; - __u32 am_flags; + __u32 am_flags; /* XFS_IOC_ATTR_* */ } xfs_attr_multiop_t; typedef struct xfs_fsop_attrmulti_handlereq { diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index bf161e930f1d..7fcf62b324b0 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -105,7 +105,7 @@ xfs_inobt_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; uint64_t realfree; @@ -177,7 +177,7 @@ xfs_inobt_insert( xfs_btnum_t btnum) { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agino_t thisino; int i; @@ -304,7 +304,7 @@ xfs_ialloc_inode_init( * That means for v3 inode we log the entire buffer rather than just the * inode cores. */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { version = 3; ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno)); @@ -339,7 +339,7 @@ xfs_ialloc_inode_init( xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = xfs_dinode_size(version); + uint isize = XFS_DINODE_SIZE(&mp->m_sb); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); @@ -525,7 +525,7 @@ xfs_inobt_insert_sprec( bool merge) /* merge or replace */ { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); int error; int i; @@ -658,7 +658,7 @@ xfs_ialloc_ag_alloc( * chunk of inodes. If the filesystem is striped, this will fill * an entire stripe unit with inodes. */ - agi = XFS_BUF_TO_AGI(agbp); + agi = agbp->b_addr; newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + @@ -1130,7 +1130,7 @@ xfs_dialloc_ag_inobt( xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); @@ -1583,7 +1583,7 @@ xfs_dialloc_ag( xfs_ino_t *inop) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); @@ -1943,7 +1943,7 @@ xfs_difree_inobt( struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_perag *pag; struct xfs_btree_cur *cur; @@ -2079,7 +2079,7 @@ xfs_difree_finobt( xfs_agino_t agino, struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_agi *agi = agbp->b_addr; xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); struct xfs_btree_cur *cur; struct xfs_inobt_rec_incore rec; @@ -2489,9 +2489,8 @@ xfs_ialloc_log_agi( sizeof(xfs_agi_t) }; #ifdef DEBUG - xfs_agi_t *agi; /* allocation group header */ + struct xfs_agi *agi = bp->b_addr; - agi = XFS_BUF_TO_AGI(bp); ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif @@ -2523,14 +2522,13 @@ xfs_agi_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; - struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + struct xfs_agi *agi = bp->b_addr; int i; if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (!xfs_log_check_lsn(mp, - be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn))) + if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn))) return __this_address; } @@ -2593,6 +2591,7 @@ xfs_agi_write_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_agi *agi = bp->b_addr; xfs_failaddr_t fa; fa = xfs_agi_verify(bp); @@ -2605,7 +2604,7 @@ xfs_agi_write_verify( return; if (bip) - XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); } @@ -2661,7 +2660,7 @@ xfs_ialloc_read_agi( if (error) return error; - agi = XFS_BUF_TO_AGI(*bpp); + agi = (*bpp)->b_addr; pag = xfs_perag_get(mp, agno); if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); @@ -2873,7 +2872,7 @@ xfs_ialloc_setup_geometry( * cannot change the behavior. */ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { int new_size = igeo->inode_cluster_size_raw; new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index b82992f795aa..b2c122ad8f0e 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -12,6 +12,7 @@ #include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" @@ -20,7 +21,6 @@ #include "xfs_trans.h" #include "xfs_rmap.h" - STATIC int xfs_inobt_get_minrecs( struct xfs_btree_cur *cur, @@ -34,7 +34,7 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_ag.agbp, cur->bc_ag.agno, cur->bc_btnum); } @@ -44,8 +44,8 @@ xfs_inobt_set_root( union xfs_btree_ptr *nptr, int inc) /* level change */ { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; agi->agi_root = nptr->s; be32_add_cpu(&agi->agi_level, inc); @@ -58,8 +58,8 @@ xfs_finobt_set_root( union xfs_btree_ptr *nptr, int inc) /* level change */ { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agi *agi = agbp->b_addr; agi->agi_free_root = nptr->s; be32_add_cpu(&agi->agi_free_level, inc); @@ -83,7 +83,7 @@ __xfs_inobt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.oinfo = XFS_RMAP_OINFO_INOBT; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_ag.agno, sbno); args.minlen = 1; args.maxlen = 1; args.prod = 1; @@ -212,9 +212,9 @@ xfs_inobt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_root; } @@ -224,9 +224,9 @@ xfs_finobt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); + struct xfs_agi *agi = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agi->agi_seqno)); ptr->s = agi->agi_free_root; } @@ -400,32 +400,27 @@ static const struct xfs_btree_ops xfs_finobt_ops = { }; /* - * Allocate a new inode btree cursor. + * Initialize a new inode btree cursor. */ -struct xfs_btree_cur * /* new inode btree cursor */ -xfs_inobt_init_cursor( +static struct xfs_btree_cur * +xfs_inobt_init_common( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for agi structure */ xfs_agnumber_t agno, /* allocation group number */ xfs_btnum_t btnum) /* ialloc or free ino btree */ { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); - cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = btnum; if (btnum == XFS_BTNUM_INO) { - cur->bc_nlevels = be32_to_cpu(agi->agi_level); - cur->bc_ops = &xfs_inobt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); + cur->bc_ops = &xfs_inobt_ops; } else { - cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); - cur->bc_ops = &xfs_finobt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); + cur->bc_ops = &xfs_finobt_ops; } cur->bc_blocklog = mp->m_sb.sb_blocklog; @@ -433,12 +428,75 @@ xfs_inobt_init_cursor( if (xfs_sb_version_hascrc(&mp->m_sb)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; + cur->bc_ag.agno = agno; + return cur; +} + +/* Create an inode btree cursor. */ +struct xfs_btree_cur * +xfs_inobt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = agbp->b_addr; + cur = xfs_inobt_init_common(mp, tp, agno, btnum); + if (btnum == XFS_BTNUM_INO) + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + else + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur->bc_ag.agbp = agbp; return cur; } +/* Create an inode btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_inobt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + + cur = xfs_inobt_init_common(mp, NULL, agno, btnum); + xfs_btree_stage_afakeroot(cur, afake); + return cur; +} + +/* + * Install a new inobt btree root. Caller is responsible for invalidating + * and freeing the old btree blocks. + */ +void +xfs_inobt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agi *agi = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + if (cur->bc_btnum == XFS_BTNUM_INO) { + agi->agi_root = cpu_to_be32(afake->af_root); + agi->agi_level = cpu_to_be32(afake->af_levels); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); + } else { + agi->agi_free_root = cpu_to_be32(afake->af_root); + agi->agi_free_level = cpu_to_be32(afake->af_levels); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREE_ROOT | + XFS_AGI_FREE_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); + } +} + /* * Calculate number of records in an inobt btree block. */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 951305ecaae1..35bbd978c272 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -48,6 +48,9 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, xfs_btnum_t); +struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno, + xfs_btnum_t btnum); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ @@ -68,4 +71,7 @@ int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_btnum_t btnum, struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp); +void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 8afacfe4be0a..39c5a6e24915 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -44,17 +44,6 @@ xfs_inobp_check( } #endif -bool -xfs_dinode_good_version( - struct xfs_mount *mp, - __u8 version) -{ - if (xfs_sb_version_hascrc(&mp->m_sb)) - return version == 3; - - return version == 1 || version == 2; -} - /* * If we are doing readahead on an inode buffer, we might be in log recovery * reading an inode allocation buffer that hasn't yet been replayed, and hence @@ -93,7 +82,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && - xfs_dinode_good_version(mp, dip->di_version) && + xfs_dinode_good_version(&mp->m_sb, dip->di_version) && xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { @@ -205,16 +194,14 @@ xfs_inode_from_disk( struct xfs_icdinode *to = &ip->i_d; struct inode *inode = VFS_I(ip); - /* * Convert v1 inodes immediately to v2 inode format as this is the * minimum inode version format we support in the rest of the code. + * They will also be unconditionally written back to disk as v2 inodes. */ - to->di_version = from->di_version; - if (to->di_version == 1) { + if (unlikely(from->di_version == 1)) { set_nlink(inode, be16_to_cpu(from->di_onlink)); to->di_projid = 0; - to->di_version = 2; } else { set_nlink(inode, be32_to_cpu(from->di_nlink)); to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 | @@ -222,8 +209,8 @@ xfs_inode_from_disk( } to->di_format = from->di_format; - to->di_uid = be32_to_cpu(from->di_uid); - to->di_gid = be32_to_cpu(from->di_gid); + i_uid_write(inode, be32_to_cpu(from->di_uid)); + i_gid_write(inode, be32_to_cpu(from->di_gid)); to->di_flushiter = be16_to_cpu(from->di_flushiter); /* @@ -252,7 +239,7 @@ xfs_inode_from_disk( to->di_dmstate = be16_to_cpu(from->di_dmstate); to->di_flags = be16_to_cpu(from->di_flags); - if (to->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); to->di_crtime.tv_sec = be32_to_cpu(from->di_crtime.t_sec); @@ -274,10 +261,9 @@ xfs_inode_to_disk( to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); to->di_onlink = 0; - to->di_version = from->di_version; to->di_format = from->di_format; - to->di_uid = cpu_to_be32(from->di_uid); - to->di_gid = cpu_to_be32(from->di_gid); + to->di_uid = cpu_to_be32(i_uid_read(inode)); + to->di_gid = cpu_to_be32(i_gid_read(inode)); to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff); to->di_projid_hi = cpu_to_be16(from->di_projid >> 16); @@ -303,7 +289,8 @@ xfs_inode_to_disk( to->di_dmstate = cpu_to_be16(from->di_dmstate); to->di_flags = cpu_to_be16(from->di_flags); - if (from->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.tv_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.tv_nsec); @@ -315,6 +302,7 @@ xfs_inode_to_disk( uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { + to->di_version = 2; to->di_flushiter = cpu_to_be16(from->di_flushiter); } } @@ -428,7 +416,7 @@ xfs_dinode_verify_forkoff( case XFS_DINODE_FMT_LOCAL: /* fall through ... */ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ case XFS_DINODE_FMT_BTREE: - if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3)) + if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3)) return __this_address; break; default: @@ -454,7 +442,7 @@ xfs_dinode_verify( /* Verify v3 integrity information first */ if (dip->di_version >= 3) { - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) return __this_address; if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) @@ -629,10 +617,9 @@ xfs_iread( /* shortcut IO on inode allocation if possible */ if ((iget_flags & XFS_IGET_CREATE) && - xfs_sb_version_hascrc(&mp->m_sb) && + xfs_sb_version_has_v3inode(&mp->m_sb) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { VFS_I(ip)->i_generation = prandom_u32(); - ip->i_d.di_version = 3; return 0; } @@ -674,7 +661,6 @@ xfs_iread( * Partial initialisation of the in-core inode. Just the bits * that xfs_ialloc won't overwrite or relies on being correct. */ - ip->i_d.di_version = dip->di_version; VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen); ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); @@ -688,7 +674,6 @@ xfs_iread( VFS_I(ip)->i_mode = 0; } - ASSERT(ip->i_d.di_version >= 2); ip->i_delayed_blks = 0; /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index fd94b1078722..9b373dcf9e34 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -16,11 +16,8 @@ struct xfs_dinode; * format specific structures at the appropriate time. */ struct xfs_icdinode { - int8_t di_version; /* inode version */ int8_t di_format; /* format of di_c data */ uint16_t di_flushiter; /* incremented on flush */ - uint32_t di_uid; /* owner's user id */ - uint32_t di_gid; /* owner's group id */ uint32_t di_projid; /* owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ @@ -61,8 +58,6 @@ void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); void xfs_log_dinode_to_disk(struct xfs_log_dinode *from, struct xfs_dinode *to); -bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version); - #if defined(DEBUG) void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); #else diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index ad2b9c313fd2..518c6f0ec3a6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -183,7 +183,7 @@ xfs_iformat_local( */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %d).", + "corrupt inode %Lu (bad size %d for local fork, size = %zd).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); xfs_inode_verifier_error(ip, -EFSCORRUPTED, diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 500333d0101e..668ee942be22 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -46,14 +46,9 @@ struct xfs_ifork { (ip)->i_afp : \ (ip)->i_cowfp)) #define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_IFORK_BOFF(ip) : \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) + (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount)) #define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? \ - XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ - XFS_IFORK_BOFF(ip) : \ - 0) + (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0) #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? \ XFS_IFORK_DSIZE(ip) : \ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 9bac0d2e56dc..e3400c9c71cd 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -424,12 +424,10 @@ struct xfs_log_dinode { /* structure must be padded to 64 bit alignment */ }; -static inline uint xfs_log_dinode_size(int version) -{ - if (version == 3) - return sizeof(struct xfs_log_dinode); - return offsetof(struct xfs_log_dinode, di_next_unlinked); -} +#define xfs_log_dinode_size(mp) \ + (xfs_sb_version_has_v3inode(&(mp)->m_sb) ? \ + sizeof(struct xfs_log_dinode) : \ + offsetof(struct xfs_log_dinode, di_next_unlinked)) /* * Buffer Log Format definitions diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 6e1665f2cb67..2076627243b0 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -46,7 +46,7 @@ xfs_refcount_lookup_le( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -63,7 +63,7 @@ xfs_refcount_lookup_ge( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_GE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -80,7 +80,7 @@ xfs_refcount_lookup_eq( xfs_agblock_t bno, int *stat) { - trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_ag.agno, bno, XFS_LOOKUP_LE); cur->bc_rec.rc.rc_startblock = bno; cur->bc_rec.rc.rc_blockcount = 0; @@ -108,7 +108,7 @@ xfs_refcount_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; xfs_agblock_t realstart; @@ -119,7 +119,7 @@ xfs_refcount_get_rec( xfs_refcount_btrec_to_irec(rec, irec); - agno = cur->bc_private.a.agno; + agno = cur->bc_ag.agno; if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; @@ -144,7 +144,7 @@ xfs_refcount_get_rec( if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) goto out_bad_rec; - trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.agno, irec); return 0; out_bad_rec: @@ -169,14 +169,14 @@ xfs_refcount_update( union xfs_btree_rec rec; int error; - trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_update(cur->bc_mp, cur->bc_ag.agno, irec); rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); error = xfs_btree_update(cur, &rec); if (error) trace_xfs_refcount_update_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -193,7 +193,7 @@ xfs_refcount_insert( { int error; - trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec); + trace_xfs_refcount_insert(cur->bc_mp, cur->bc_ag.agno, irec); cur->bc_rec.rc.rc_startblock = irec->rc_startblock; cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; @@ -208,7 +208,7 @@ xfs_refcount_insert( out_error: if (error) trace_xfs_refcount_insert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -234,7 +234,7 @@ xfs_refcount_delete( error = -EFSCORRUPTED; goto out_error; } - trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec); + trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.agno, &irec); error = xfs_btree_delete(cur, i); if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { error = -EFSCORRUPTED; @@ -246,7 +246,7 @@ xfs_refcount_delete( out_error: if (error) trace_xfs_refcount_delete_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -366,7 +366,7 @@ xfs_refcount_split_extent( return 0; *shape_changed = true; - trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_ag.agno, &rcext, agbno); /* Establish the right extent. */ @@ -391,7 +391,7 @@ xfs_refcount_split_extent( out_error: trace_xfs_refcount_split_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -411,7 +411,7 @@ xfs_refcount_merge_center_extents( int found_rec; trace_xfs_refcount_merge_center_extents(cur->bc_mp, - cur->bc_private.a.agno, left, center, right); + cur->bc_ag.agno, left, center, right); /* * Make sure the center and right extents are not in the btree. @@ -468,7 +468,7 @@ xfs_refcount_merge_center_extents( out_error: trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -487,7 +487,7 @@ xfs_refcount_merge_left_extent( int found_rec; trace_xfs_refcount_merge_left_extent(cur->bc_mp, - cur->bc_private.a.agno, left, cleft); + cur->bc_ag.agno, left, cleft); /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ if (cleft->rc_refcount > 1) { @@ -530,7 +530,7 @@ xfs_refcount_merge_left_extent( out_error: trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -548,7 +548,7 @@ xfs_refcount_merge_right_extent( int found_rec; trace_xfs_refcount_merge_right_extent(cur->bc_mp, - cur->bc_private.a.agno, cright, right); + cur->bc_ag.agno, cright, right); /* * If the extent ending at agbno+aglen (cright) wasn't synthesized, @@ -594,7 +594,7 @@ xfs_refcount_merge_right_extent( out_error: trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -679,13 +679,13 @@ xfs_refcount_find_left_extents( cleft->rc_blockcount = aglen; cleft->rc_refcount = 1; } - trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_ag.agno, left, cleft, agbno); return error; out_error: trace_xfs_refcount_find_left_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -768,13 +768,13 @@ xfs_refcount_find_right_extents( cright->rc_blockcount = aglen; cright->rc_refcount = 1; } - trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_ag.agno, cright, right, agbno + aglen); return error; out_error: trace_xfs_refcount_find_right_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -883,7 +883,7 @@ xfs_refcount_still_have_space( { unsigned long overhead; - overhead = cur->bc_private.a.priv.refc.shape_changes * + overhead = cur->bc_ag.refc.shape_changes * xfs_allocfree_log_count(cur->bc_mp, 1); overhead *= cur->bc_mp->m_sb.sb_blocksize; @@ -891,17 +891,17 @@ xfs_refcount_still_have_space( * Only allow 2 refcount extent updates per transaction if the * refcount continue update "error" has been injected. */ - if (cur->bc_private.a.priv.refc.nr_ops > 2 && + if (cur->bc_ag.refc.nr_ops > 2 && XFS_TEST_ERROR(false, cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; - if (cur->bc_private.a.priv.refc.nr_ops == 0) + if (cur->bc_ag.refc.nr_ops == 0) return true; else if (overhead > cur->bc_tp->t_log_res) return false; return cur->bc_tp->t_log_res - overhead > - cur->bc_private.a.priv.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; + cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } /* @@ -952,7 +952,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock - *agbno); tmp.rc_refcount = 1 + adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &tmp); + cur->bc_ag.agno, &tmp); /* * Either cover the hole (increment) or @@ -968,10 +968,10 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_private.a.priv.refc.nr_ops++; + cur->bc_ag.refc.nr_ops++; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_private.a.agno, + cur->bc_ag.agno, tmp.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, tmp.rc_blockcount, oinfo); @@ -998,12 +998,12 @@ xfs_refcount_adjust_extents( goto skip; ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &ext); + cur->bc_ag.agno, &ext); if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) goto out_error; - cur->bc_private.a.priv.refc.nr_ops++; + cur->bc_ag.refc.nr_ops++; } else if (ext.rc_refcount == 1) { error = xfs_refcount_delete(cur, &found_rec); if (error) @@ -1012,11 +1012,11 @@ xfs_refcount_adjust_extents( error = -EFSCORRUPTED; goto out_error; } - cur->bc_private.a.priv.refc.nr_ops++; + cur->bc_ag.refc.nr_ops++; goto advloop; } else { fsbno = XFS_AGB_TO_FSB(cur->bc_mp, - cur->bc_private.a.agno, + cur->bc_ag.agno, ext.rc_startblock); xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount, oinfo); @@ -1035,7 +1035,7 @@ advloop: return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1057,10 +1057,10 @@ xfs_refcount_adjust( *new_agbno = agbno; *new_aglen = aglen; if (adj == XFS_REFCOUNT_ADJUST_INCREASE) - trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_increase(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); else - trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); /* @@ -1088,7 +1088,7 @@ xfs_refcount_adjust( if (shape_changed) shape_changes++; if (shape_changes) - cur->bc_private.a.priv.refc.shape_changes++; + cur->bc_ag.refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, @@ -1099,7 +1099,7 @@ xfs_refcount_adjust( return 0; out_error: - trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1115,7 +1115,7 @@ xfs_refcount_finish_one_cleanup( if (rcur == NULL) return; - agbp = rcur->bc_private.a.agbp; + agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); if (error) xfs_trans_brelse(tp, agbp); @@ -1165,9 +1165,9 @@ xfs_refcount_finish_one( * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_private.a.agno != agno) { - nr_ops = rcur->bc_private.a.priv.refc.nr_ops; - shape_changes = rcur->bc_private.a.priv.refc.shape_changes; + if (rcur != NULL && rcur->bc_ag.agno != agno) { + nr_ops = rcur->bc_ag.refc.nr_ops; + shape_changes = rcur->bc_ag.refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -1183,8 +1183,8 @@ xfs_refcount_finish_one( error = -ENOMEM; goto out_cur; } - rcur->bc_private.a.priv.refc.nr_ops = nr_ops; - rcur->bc_private.a.priv.refc.shape_changes = shape_changes; + rcur->bc_ag.refc.nr_ops = nr_ops; + rcur->bc_ag.refc.shape_changes = shape_changes; } *pcur = rcur; @@ -1303,7 +1303,7 @@ xfs_refcount_find_shared( int have; int error; - trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_ag.agno, agbno, aglen); /* By default, skip the whole range */ @@ -1383,12 +1383,12 @@ xfs_refcount_find_shared( done: trace_xfs_refcount_find_shared_result(cur->bc_mp, - cur->bc_private.a.agno, *fbno, *flen); + cur->bc_ag.agno, *fbno, *flen); out_error: if (error) trace_xfs_refcount_find_shared_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1485,7 +1485,7 @@ xfs_refcount_adjust_cow_extents( tmp.rc_blockcount = aglen; tmp.rc_refcount = 1; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &tmp); + cur->bc_ag.agno, &tmp); error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -1513,7 +1513,7 @@ xfs_refcount_adjust_cow_extents( ext.rc_refcount = 0; trace_xfs_refcount_modify_extent(cur->bc_mp, - cur->bc_private.a.agno, &ext); + cur->bc_ag.agno, &ext); error = xfs_refcount_delete(cur, &found_rec); if (error) goto out_error; @@ -1529,7 +1529,7 @@ xfs_refcount_adjust_cow_extents( return error; out_error: trace_xfs_refcount_modify_extent_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1575,7 +1575,7 @@ xfs_refcount_adjust_cow( return 0; out_error: - trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1589,7 +1589,7 @@ __xfs_refcount_cow_alloc( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, + trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_ag.agno, agbno, aglen); /* Add refcount btree reservation */ @@ -1606,7 +1606,7 @@ __xfs_refcount_cow_free( xfs_agblock_t agbno, xfs_extlen_t aglen) { - trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, + trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_ag.agno, agbno, aglen); /* Remove refcount btree reservation */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 38529dbacd55..7fd6044a4f78 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -12,6 +12,7 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_refcount_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" @@ -25,7 +26,7 @@ xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_ag.agbp, cur->bc_ag.agno); } STATIC void @@ -34,8 +35,8 @@ xfs_refcountbt_set_root( union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); @@ -57,8 +58,8 @@ xfs_refcountbt_alloc_block( union xfs_btree_ptr *new, int *stat) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; struct xfs_alloc_arg args; /* block allocation args */ int error; /* error return value */ @@ -66,7 +67,7 @@ xfs_refcountbt_alloc_block( args.tp = cur->bc_tp; args.mp = cur->bc_mp; args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, xfs_refc_block(args.mp)); args.oinfo = XFS_RMAP_OINFO_REFC; args.minlen = args.maxlen = args.prod = 1; @@ -75,13 +76,13 @@ xfs_refcountbt_alloc_block( error = xfs_alloc_vextent(&args); if (error) goto out_error; - trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; } - ASSERT(args.agno == cur->bc_private.a.agno); + ASSERT(args.agno == cur->bc_ag.agno); ASSERT(args.len == 1); new->s = cpu_to_be32(args.agbno); @@ -101,12 +102,12 @@ xfs_refcountbt_free_block( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); int error; - trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno, XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); @@ -169,9 +170,9 @@ xfs_refcountbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_refcount_root; } @@ -311,41 +312,90 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = { }; /* - * Allocate a new refcount btree cursor. + * Initialize a new refcount btree cursor. */ -struct xfs_btree_cur * -xfs_refcountbt_init_cursor( +static struct xfs_btree_cur * +xfs_refcountbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_buf *agbp, xfs_agnumber_t agno) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; ASSERT(agno != NULLAGNUMBER); ASSERT(agno < mp->m_sb.sb_agcount); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = XFS_BTNUM_REFC; cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_refcountbt_ops; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); - cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); - - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; + cur->bc_ag.agno = agno; cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_private.a.priv.refc.nr_ops = 0; - cur->bc_private.a.priv.refc.shape_changes = 0; + cur->bc_ag.refc.nr_ops = 0; + cur->bc_ag.refc.shape_changes = 0; + cur->bc_ops = &xfs_refcountbt_ops; + return cur; +} +/* Create a btree cursor. */ +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_refcountbt_init_common(mp, tp, agno); + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + cur->bc_ag.agbp = agbp; return cur; } +/* Create a btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_refcountbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno) +{ + struct xfs_btree_cur *cur; + + cur = xfs_refcountbt_init_common(mp, NULL, agno); + xfs_btree_stage_afakeroot(cur, afake); + return cur; +} + +/* + * Swap in the new btree root. Once we pass this point the newly rebuilt btree + * is in place and we have to kill off all the old btree blocks. + */ +void +xfs_refcountbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_refcount_root = cpu_to_be32(afake->af_root); + agf->agf_refcount_level = cpu_to_be32(afake->af_levels); + agf->agf_refcount_blocks = cpu_to_be32(afake->af_blocks); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS | + XFS_AGF_REFCOUNT_ROOT | + XFS_AGF_REFCOUNT_LEVEL); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops); +} + /* * Calculate the number of records in a refcount btree block. */ @@ -420,7 +470,7 @@ xfs_refcountbt_calc_reserves( if (error) return error; - agf = XFS_BUF_TO_AGF(agbp); + agf = agbp->b_addr; agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_refcount_blocks); xfs_trans_brelse(tp, agbp); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index ba416f71c824..69dc515db671 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -13,6 +13,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* * Btree block header size @@ -46,6 +47,8 @@ struct xfs_mount; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno); +struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno); extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); @@ -58,4 +61,7 @@ extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); +void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); + #endif /* __XFS_REFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index ff9412f113c4..27c39268c31f 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -79,7 +79,7 @@ xfs_rmap_update( union xfs_btree_rec rec; int error; - trace_xfs_rmap_update(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmap_update(cur->bc_mp, cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); @@ -91,7 +91,7 @@ xfs_rmap_update( error = xfs_btree_update(cur, &rec); if (error) trace_xfs_rmap_update_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -107,7 +107,7 @@ xfs_rmap_insert( int i; int error; - trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + trace_xfs_rmap_insert(rcur->bc_mp, rcur->bc_ag.agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -133,7 +133,7 @@ xfs_rmap_insert( done: if (error) trace_xfs_rmap_insert_error(rcur->bc_mp, - rcur->bc_private.a.agno, error, _RET_IP_); + rcur->bc_ag.agno, error, _RET_IP_); return error; } @@ -149,7 +149,7 @@ xfs_rmap_delete( int i; int error; - trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_ag.agno, agbno, len, owner, offset, flags); error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); @@ -170,7 +170,7 @@ xfs_rmap_delete( done: if (error) trace_xfs_rmap_delete_error(rcur->bc_mp, - rcur->bc_private.a.agno, error, _RET_IP_); + rcur->bc_ag.agno, error, _RET_IP_); return error; } @@ -197,7 +197,7 @@ xfs_rmap_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_private.a.agno; + xfs_agnumber_t agno = cur->bc_ag.agno; union xfs_btree_rec *rec; int error; @@ -260,7 +260,7 @@ xfs_rmap_find_left_neighbor_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, - cur->bc_private.a.agno, rec->rm_startblock, + cur->bc_ag.agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -312,7 +312,7 @@ xfs_rmap_find_left_neighbor( info.stat = stat; trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, - cur->bc_private.a.agno, bno, 0, owner, offset, flags); + cur->bc_ag.agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_find_left_neighbor_helper, &info); @@ -320,7 +320,7 @@ xfs_rmap_find_left_neighbor( error = 0; if (*stat) trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, irec->rm_startblock, + cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -336,7 +336,7 @@ xfs_rmap_lookup_le_range_helper( struct xfs_find_left_neighbor_info *info = priv; trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, - cur->bc_private.a.agno, rec->rm_startblock, + cur->bc_ag.agno, rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, rec->rm_offset, rec->rm_flags); @@ -385,14 +385,14 @@ xfs_rmap_lookup_le_range( info.stat = stat; trace_xfs_rmap_lookup_le_range(cur->bc_mp, - cur->bc_private.a.agno, bno, 0, owner, offset, flags); + cur->bc_ag.agno, bno, 0, owner, offset, flags); error = xfs_rmap_query_range(cur, &info.high, &info.high, xfs_rmap_lookup_le_range_helper, &info); if (error == -ECANCELED) error = 0; if (*stat) trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, irec->rm_startblock, + cur->bc_ag.agno, irec->rm_startblock, irec->rm_blockcount, irec->rm_owner, irec->rm_offset, irec->rm_flags); return error; @@ -498,7 +498,7 @@ xfs_rmap_unmap( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -522,7 +522,7 @@ xfs_rmap_unmap( goto out_error; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, ltrec.rm_startblock, + cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); ltoff = ltrec.rm_offset; @@ -588,7 +588,7 @@ xfs_rmap_unmap( if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { /* exact match, simply remove the record from rmap tree */ - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -666,7 +666,7 @@ xfs_rmap_unmap( else cur->bc_rec.r.rm_offset = offset + len; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, cur->bc_rec.r.rm_startblock, cur->bc_rec.r.rm_blockcount, cur->bc_rec.r.rm_owner, @@ -678,11 +678,11 @@ xfs_rmap_unmap( } out_done: - trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_unmap_error(mp, cur->bc_private.a.agno, + trace_xfs_rmap_unmap_error(mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -773,7 +773,7 @@ xfs_rmap_map( (flags & XFS_RMAP_BMBT_BLOCK); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); ASSERT(!xfs_rmap_should_skip_owner_update(oinfo)); @@ -795,7 +795,7 @@ xfs_rmap_map( goto out_error; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, ltrec.rm_startblock, + cur->bc_ag.agno, ltrec.rm_startblock, ltrec.rm_blockcount, ltrec.rm_owner, ltrec.rm_offset, ltrec.rm_flags); @@ -831,7 +831,7 @@ xfs_rmap_map( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, gtrec.rm_startblock, + cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); if (!xfs_rmap_is_mergeable(>rec, owner, flags)) @@ -870,7 +870,7 @@ xfs_rmap_map( * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| */ ltrec.rm_blockcount += gtrec.rm_blockcount; - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, @@ -921,7 +921,7 @@ xfs_rmap_map( cur->bc_rec.r.rm_owner = owner; cur->bc_rec.r.rm_offset = offset; cur->bc_rec.r.rm_flags = flags; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, flags); error = xfs_btree_insert(cur, &i); if (error) @@ -932,11 +932,11 @@ xfs_rmap_map( } } - trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) - trace_xfs_rmap_map_error(mp, cur->bc_private.a.agno, + trace_xfs_rmap_map_error(mp, cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1010,7 +1010,7 @@ xfs_rmap_convert( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -1034,7 +1034,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, - cur->bc_private.a.agno, PREV.rm_startblock, + cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1076,7 +1076,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, LEFT.rm_startblock, + cur->bc_ag.agno, LEFT.rm_startblock, LEFT.rm_blockcount, LEFT.rm_owner, LEFT.rm_offset, LEFT.rm_flags); if (LEFT.rm_startblock + LEFT.rm_blockcount == bno && @@ -1114,7 +1114,7 @@ xfs_rmap_convert( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, RIGHT.rm_startblock, + cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (bno + len == RIGHT.rm_startblock && @@ -1132,7 +1132,7 @@ xfs_rmap_convert( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, _RET_IP_); /* reset the cursor back to PREV */ @@ -1162,7 +1162,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1180,7 +1180,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1210,7 +1210,7 @@ xfs_rmap_convert( * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, PREV.rm_startblock, PREV.rm_blockcount, PREV.rm_owner, PREV.rm_offset, PREV.rm_flags); @@ -1247,7 +1247,7 @@ xfs_rmap_convert( error = -EFSCORRUPTED; goto done; } - trace_xfs_rmap_delete(mp, cur->bc_private.a.agno, + trace_xfs_rmap_delete(mp, cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); @@ -1326,7 +1326,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1383,7 +1383,7 @@ xfs_rmap_convert( NEW.rm_blockcount = len; NEW.rm_flags = newext; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1414,7 +1414,7 @@ xfs_rmap_convert( NEW = PREV; NEW.rm_blockcount = offset - PREV.rm_offset; cur->bc_rec.r = NEW; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, NEW.rm_startblock, NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, NEW.rm_flags); @@ -1441,7 +1441,7 @@ xfs_rmap_convert( /* new middle extent - newext */ cur->bc_rec.r.rm_flags &= ~XFS_RMAP_UNWRITTEN; cur->bc_rec.r.rm_flags |= newext; - trace_xfs_rmap_insert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_insert(mp, cur->bc_ag.agno, bno, len, owner, offset, newext); error = xfs_btree_insert(cur, &i); if (error) @@ -1465,12 +1465,12 @@ xfs_rmap_convert( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1506,7 +1506,7 @@ xfs_rmap_convert_shared( (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; new_endoff = offset + len; - trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -1573,7 +1573,7 @@ xfs_rmap_convert_shared( goto done; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, RIGHT.rm_startblock, + cur->bc_ag.agno, RIGHT.rm_startblock, RIGHT.rm_blockcount, RIGHT.rm_owner, RIGHT.rm_offset, RIGHT.rm_flags); if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) @@ -1589,7 +1589,7 @@ xfs_rmap_convert_shared( RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) state &= ~RMAP_RIGHT_CONTIG; - trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + trace_xfs_rmap_convert_state(mp, cur->bc_ag.agno, state, _RET_IP_); /* * Switch out based on the FILLING and CONTIG state bits. @@ -1880,12 +1880,12 @@ xfs_rmap_convert_shared( ASSERT(0); } - trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_convert_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); done: if (error) trace_xfs_rmap_convert_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -1923,7 +1923,7 @@ xfs_rmap_unmap_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* @@ -2072,12 +2072,12 @@ xfs_rmap_unmap_shared( goto out_error; } - trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_unmap_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_unmap_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -2112,7 +2112,7 @@ xfs_rmap_map_shared( xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); if (unwritten) flags |= XFS_RMAP_UNWRITTEN; - trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); /* Is there a left record that abuts our range? */ @@ -2138,7 +2138,7 @@ xfs_rmap_map_shared( goto out_error; } trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, - cur->bc_private.a.agno, gtrec.rm_startblock, + cur->bc_ag.agno, gtrec.rm_startblock, gtrec.rm_blockcount, gtrec.rm_owner, gtrec.rm_offset, gtrec.rm_flags); @@ -2231,12 +2231,12 @@ xfs_rmap_map_shared( goto out_error; } - trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + trace_xfs_rmap_map_done(mp, cur->bc_ag.agno, bno, len, unwritten, oinfo); out_error: if (error) trace_xfs_rmap_map_error(cur->bc_mp, - cur->bc_private.a.agno, error, _RET_IP_); + cur->bc_ag.agno, error, _RET_IP_); return error; } @@ -2336,7 +2336,7 @@ xfs_rmap_finish_one_cleanup( if (rcur == NULL) return; - agbp = rcur->bc_private.a.agbp; + agbp = rcur->bc_ag.agbp; xfs_btree_del_cursor(rcur, error); if (error) xfs_trans_brelse(tp, agbp); @@ -2386,7 +2386,7 @@ xfs_rmap_finish_one( * the startblock, get one now. */ rcur = *pcur; - if (rcur != NULL && rcur->bc_private.a.agno != agno) { + if (rcur != NULL && rcur->bc_ag.agno != agno) { xfs_rmap_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -2694,7 +2694,6 @@ struct xfs_rmap_key_state { uint64_t owner; uint64_t offset; unsigned int flags; - bool has_rmap; }; /* For each rmap given, figure out if it doesn't match the key we want. */ @@ -2709,7 +2708,6 @@ xfs_rmap_has_other_keys_helper( if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset && ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) return 0; - rks->has_rmap = true; return -ECANCELED; } @@ -2731,7 +2729,7 @@ xfs_rmap_has_other_keys( int error; xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags); - rks.has_rmap = false; + *has_rmap = false; low.rm_startblock = bno; memset(&high, 0xFF, sizeof(high)); @@ -2739,11 +2737,12 @@ xfs_rmap_has_other_keys( error = xfs_rmap_query_range(cur, &low, &high, xfs_rmap_has_other_keys_helper, &rks); - if (error < 0) - return error; + if (error == -ECANCELED) { + *has_rmap = true; + return 0; + } - *has_rmap = rks.has_rmap; - return 0; + return error; } const struct xfs_owner_info XFS_RMAP_OINFO_SKIP_UPDATE = { diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index fc78efa52c94..b7c05314d07c 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -14,6 +14,7 @@ #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" #include "xfs_trace.h" @@ -51,7 +52,7 @@ xfs_rmapbt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agbp, cur->bc_private.a.agno); + cur->bc_ag.agbp, cur->bc_ag.agno); } STATIC void @@ -60,8 +61,8 @@ xfs_rmapbt_set_root( union xfs_btree_ptr *ptr, int inc) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); int btnum = cur->bc_btnum; struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); @@ -83,25 +84,25 @@ xfs_rmapbt_alloc_block( union xfs_btree_ptr *new, int *stat) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; int error; xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; - trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_ag.agno, bno, 1); if (bno == NULLAGBLOCK) { *stat = 0; return 0; } - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agno, bno, 1, false); xfs_trans_agbtree_delta(cur->bc_tp, 1); @@ -109,7 +110,7 @@ xfs_rmapbt_alloc_block( be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno); + xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_ag.agno); *stat = 1; return 0; @@ -120,13 +121,13 @@ xfs_rmapbt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - struct xfs_buf *agbp = cur->bc_private.a.agbp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_buf *agbp = cur->bc_ag.agbp; + struct xfs_agf *agf = agbp->b_addr; xfs_agblock_t bno; int error; bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); - trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno, bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); @@ -138,7 +139,7 @@ xfs_rmapbt_free_block( XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); - xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno); + xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_ag.agno); return 0; } @@ -215,9 +216,9 @@ xfs_rmapbt_init_ptr_from_cur( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; - ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(cur->bc_ag.agno == be32_to_cpu(agf->agf_seqno)); ptr->s = agf->agf_roots[cur->bc_btnum]; } @@ -448,17 +449,12 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = { .recs_inorder = xfs_rmapbt_recs_inorder, }; -/* - * Allocate a new allocation btree cursor. - */ -struct xfs_btree_cur * -xfs_rmapbt_init_cursor( +static struct xfs_btree_cur * +xfs_rmapbt_init_common( struct xfs_mount *mp, struct xfs_trans *tp, - struct xfs_buf *agbp, xfs_agnumber_t agno) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); struct xfs_btree_cur *cur; cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); @@ -468,17 +464,68 @@ xfs_rmapbt_init_cursor( cur->bc_btnum = XFS_BTNUM_RMAP; cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); + cur->bc_ag.agno = agno; cur->bc_ops = &xfs_rmapbt_ops; + + return cur; +} + +/* Create a new reverse mapping btree cursor. */ +struct xfs_btree_cur * +xfs_rmapbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xfs_btree_cur *cur; + + cur = xfs_rmapbt_init_common(mp, tp, agno); cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); + cur->bc_ag.agbp = agbp; + return cur; +} - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; +/* Create a new reverse mapping btree cursor with a fake root for staging. */ +struct xfs_btree_cur * +xfs_rmapbt_stage_cursor( + struct xfs_mount *mp, + struct xbtree_afakeroot *afake, + xfs_agnumber_t agno) +{ + struct xfs_btree_cur *cur; + cur = xfs_rmapbt_init_common(mp, NULL, agno); + xfs_btree_stage_afakeroot(cur, afake); return cur; } /* + * Install a new reverse mapping btree root. Caller is responsible for + * invalidating and freeing the old btree blocks. + */ +void +xfs_rmapbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_agf *agf = agbp->b_addr; + struct xbtree_afakeroot *afake = cur->bc_ag.afake; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + + agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); + agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS | + XFS_AGF_RMAP_BLOCKS); + xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops); +} + +/* * Calculate number of records in an rmap btree block. */ int @@ -569,7 +616,7 @@ xfs_rmapbt_calc_reserves( if (error) return error; - agf = XFS_BUF_TO_AGF(agbp); + agf = agbp->b_addr; agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_rmap_blocks); xfs_trans_brelse(tp, agbp); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 820d668b063d..115c3455a734 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -9,6 +9,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; +struct xbtree_afakeroot; /* rmaps only exist on crc enabled filesystems */ #define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN @@ -43,6 +44,10 @@ struct xfs_mount; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, xfs_agnumber_t agno); +struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, + struct xbtree_afakeroot *afake, xfs_agnumber_t agno); +void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, struct xfs_buf *agbp); int xfs_rmapbt_maxrecs(int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 2f60fc3c99a0..c526c5e5ab76 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -220,7 +220,7 @@ xfs_validate_sb_common( struct xfs_buf *bp, struct xfs_sb *sbp) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; uint32_t agcount = 0; uint32_t rem; @@ -328,6 +328,38 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } + /* Validate the realtime geometry; stolen from xfs_repair */ + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { + xfs_notice(mp, + "realtime extent sanity check failed"); + return -EFSCORRUPTED; + } + + if (sbp->sb_rblocks == 0) { + if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || + sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) { + xfs_notice(mp, + "realtime zeroed geometry check failed"); + return -EFSCORRUPTED; + } + } else { + uint64_t rexts; + uint64_t rbmblocks; + + rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); + rbmblocks = howmany_64(sbp->sb_rextents, + NBBY * sbp->sb_blocksize); + + if (sbp->sb_rextents != rexts || + sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + sbp->sb_rbmblocks != rbmblocks) { + xfs_notice(mp, + "realtime geometry sanity check failed"); + return -EFSCORRUPTED; + } + } + if (sbp->sb_unit) { if (!xfs_sb_version_hasdalign(sbp) || sbp->sb_unit > sbp->sb_width || @@ -681,7 +713,7 @@ xfs_sb_read_verify( { struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; int error; /* @@ -707,7 +739,7 @@ xfs_sb_read_verify( * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ - __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; @@ -730,7 +762,7 @@ static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { - struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + struct xfs_dsb *dsb = bp->b_addr; if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! */ @@ -748,13 +780,14 @@ xfs_sb_write_verify( struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; + struct xfs_dsb *dsb = bp->b_addr; int error; /* * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ - __xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false); + __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; @@ -766,7 +799,7 @@ xfs_sb_write_verify( return; if (bip) - XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); return; @@ -927,7 +960,7 @@ xfs_log_sb( mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } @@ -1007,7 +1040,7 @@ xfs_update_secondary_sbs( bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_buf_delwri_queue(bp, &buffer_list); xfs_buf_relse(bp); diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 7a9c04920505..d1a0848cb52e 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -187,7 +187,7 @@ xfs_calc_inode_chunk_res( XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_sb_version_has_v3inode(&mp->m_sb)) return res; size = XFS_FSB_TO_B(mp, 1); } diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index ba0f747c82e8..e9bcf1faa183 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -92,7 +92,7 @@ xchk_superblock( if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error)) return error; - sb = XFS_BUF_TO_SBP(bp); + sb = bp->b_addr; /* * Verify the geometries match. Fields that are permanently @@ -358,7 +358,7 @@ static inline void xchk_agf_xref_freeblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_extlen_t blocks = 0; int error; @@ -378,7 +378,7 @@ static inline void xchk_agf_xref_cntbt( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_agblock_t agbno; xfs_extlen_t blocks; int have; @@ -410,7 +410,7 @@ STATIC void xchk_agf_xref_btreeblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t blocks; xfs_agblock_t btreeblks; @@ -456,7 +456,7 @@ static inline void xchk_agf_xref_refcblks( struct xfs_scrub *sc) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; xfs_agblock_t blocks; int error; @@ -525,7 +525,7 @@ xchk_agf( goto out; xchk_buffer_recheck(sc, sc->sa.agf_bp); - agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agf = sc->sa.agf_bp->b_addr; /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); @@ -711,7 +711,7 @@ xchk_agfl( goto out; /* Allocate buffer to ensure uniqueness of AGFL entries. */ - agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + agf = sc->sa.agf_bp->b_addr; agflcount = be32_to_cpu(agf->agf_flcount); if (agflcount > xfs_agfl_size(sc->mp)) { xchk_block_set_corrupt(sc, sc->sa.agf_bp); @@ -728,7 +728,7 @@ xchk_agfl( } /* Check the blocks in the AGFL. */ - error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), + error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sc->sa.agfl_bp, xchk_agfl_block, &sai); if (error == -ECANCELED) { error = 0; @@ -765,7 +765,7 @@ static inline void xchk_agi_xref_icounts( struct xfs_scrub *sc) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; xfs_agino_t icount; xfs_agino_t freecount; int error; @@ -834,7 +834,7 @@ xchk_agi( goto out; xchk_buffer_recheck(sc, sc->sa.agi_bp); - agi = XFS_BUF_TO_AGI(sc->sa.agi_bp); + agi = sc->sa.agi_bp->b_addr; /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index d5e6db9af434..bca2ab1d4be9 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -49,7 +49,7 @@ xrep_superblock( /* Copy AG 0's superblock to this one. */ xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); @@ -140,7 +140,7 @@ xrep_agf_find_btrees( struct xrep_find_ag_btree *fab, struct xfs_buf *agfl_bp) { - struct xfs_agf *old_agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *old_agf = agf_bp->b_addr; int error; /* Go find the root data. */ @@ -181,7 +181,7 @@ xrep_agf_init_header( struct xfs_agf *old_agf) { struct xfs_mount *mp = sc->mp; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; memcpy(old_agf, agf, sizeof(*old_agf)); memset(agf, 0, BBTOB(agf_bp->b_length)); @@ -238,7 +238,7 @@ xrep_agf_calc_from_btrees( { struct xrep_agf_allocbt raa = { .sc = sc }; struct xfs_btree_cur *cur = NULL; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agblock_t btreeblks; xfs_agblock_t blocks; @@ -302,7 +302,7 @@ xrep_agf_commit_new( struct xfs_buf *agf_bp) { struct xfs_perag *pag; - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; /* Trigger fdblocks recalculation */ xfs_force_summary_recalc(sc->mp); @@ -376,7 +376,7 @@ xrep_agf( if (error) return error; agf_bp->b_ops = &xfs_agf_buf_ops; - agf = XFS_BUF_TO_AGF(agf_bp); + agf = agf_bp->b_addr; /* * Load the AGFL so that we can screen out OWN_AG blocks that are on @@ -395,7 +395,7 @@ xrep_agf( * Spot-check the AGFL blocks; if they're obviously corrupt then * there's nothing we can do but bail out. */ - error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp, + error = xfs_agfl_walk(sc->mp, agf_bp->b_addr, agfl_bp, xrep_agf_check_agfl_block, sc); if (error) return error; @@ -429,10 +429,10 @@ out_revert: struct xrep_agfl { /* Bitmap of other OWN_AG metadata blocks. */ - struct xfs_bitmap agmetablocks; + struct xbitmap agmetablocks; /* Bitmap of free space. */ - struct xfs_bitmap *freesp; + struct xbitmap *freesp; struct xfs_scrub *sc; }; @@ -453,14 +453,14 @@ xrep_agfl_walk_rmap( /* Record all the OWN_AG blocks. */ if (rec->rm_owner == XFS_RMAP_OWN_AG) { - fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, rec->rm_startblock); - error = xfs_bitmap_set(ra->freesp, fsb, rec->rm_blockcount); + error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount); if (error) return error; } - return xfs_bitmap_set_btcur_path(&ra->agmetablocks, cur); + return xbitmap_set_btcur_path(&ra->agmetablocks, cur); } /* @@ -476,19 +476,17 @@ STATIC int xrep_agfl_collect_blocks( struct xfs_scrub *sc, struct xfs_buf *agf_bp, - struct xfs_bitmap *agfl_extents, + struct xbitmap *agfl_extents, xfs_agblock_t *flcount) { struct xrep_agfl ra; struct xfs_mount *mp = sc->mp; struct xfs_btree_cur *cur; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *n; int error; ra.sc = sc; ra.freesp = agfl_extents; - xfs_bitmap_init(&ra.agmetablocks); + xbitmap_init(&ra.agmetablocks); /* Find all space used by the free space btrees & rmapbt. */ cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); @@ -500,7 +498,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the bnobt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, XFS_BTNUM_BNO); - error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur); + error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; xfs_btree_del_cursor(cur, error); @@ -508,7 +506,7 @@ xrep_agfl_collect_blocks( /* Find all blocks currently being used by the cntbt. */ cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, XFS_BTNUM_CNT); - error = xfs_bitmap_set_btblocks(&ra.agmetablocks, cur); + error = xbitmap_set_btblocks(&ra.agmetablocks, cur); if (error) goto err; @@ -518,8 +516,8 @@ xrep_agfl_collect_blocks( * Drop the freesp meta blocks that are in use by btrees. * The remaining blocks /should/ be AGFL blocks. */ - error = xfs_bitmap_disunion(agfl_extents, &ra.agmetablocks); - xfs_bitmap_destroy(&ra.agmetablocks); + error = xbitmap_disunion(agfl_extents, &ra.agmetablocks); + xbitmap_destroy(&ra.agmetablocks); if (error) return error; @@ -527,18 +525,12 @@ xrep_agfl_collect_blocks( * Calculate the new AGFL size. If we found more blocks than fit in * the AGFL we'll free them later. */ - *flcount = 0; - for_each_xfs_bitmap_extent(br, n, agfl_extents) { - *flcount += br->len; - if (*flcount > xfs_agfl_size(mp)) - break; - } - if (*flcount > xfs_agfl_size(mp)) - *flcount = xfs_agfl_size(mp); + *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents), + xfs_agfl_size(mp)); return 0; err: - xfs_bitmap_destroy(&ra.agmetablocks); + xbitmap_destroy(&ra.agmetablocks); xfs_btree_del_cursor(cur, error); return error; } @@ -550,7 +542,7 @@ xrep_agfl_update_agf( struct xfs_buf *agf_bp, xfs_agblock_t flcount) { - struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); + struct xfs_agf *agf = agf_bp->b_addr; ASSERT(flcount <= xfs_agfl_size(sc->mp)); @@ -573,13 +565,13 @@ STATIC void xrep_agfl_init_header( struct xfs_scrub *sc, struct xfs_buf *agfl_bp, - struct xfs_bitmap *agfl_extents, + struct xbitmap *agfl_extents, xfs_agblock_t flcount) { struct xfs_mount *mp = sc->mp; __be32 *agfl_bno; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *n; + struct xbitmap_range *br; + struct xbitmap_range *n; struct xfs_agfl *agfl; xfs_agblock_t agbno; unsigned int fl_off; @@ -602,8 +594,8 @@ xrep_agfl_init_header( * step. */ fl_off = 0; - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agfl_bp); - for_each_xfs_bitmap_extent(br, n, agfl_extents) { + agfl_bno = xfs_buf_to_agfl_bno(agfl_bp); + for_each_xbitmap_extent(br, n, agfl_extents) { agbno = XFS_FSB_TO_AGBNO(mp, br->start); trace_xrep_agfl_insert(mp, sc->sa.agno, agbno, br->len); @@ -637,7 +629,7 @@ int xrep_agfl( struct xfs_scrub *sc) { - struct xfs_bitmap agfl_extents; + struct xbitmap agfl_extents; struct xfs_mount *mp = sc->mp; struct xfs_buf *agf_bp; struct xfs_buf *agfl_bp; @@ -649,7 +641,7 @@ xrep_agfl( return -EOPNOTSUPP; xchk_perag_get(sc->mp, &sc->sa); - xfs_bitmap_init(&agfl_extents); + xbitmap_init(&agfl_extents); /* * Read the AGF so that we can query the rmapbt. We hope that there's @@ -696,10 +688,10 @@ xrep_agfl( goto err; /* Dump any AGFL overflow. */ - return xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, + error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); err: - xfs_bitmap_destroy(&agfl_extents); + xbitmap_destroy(&agfl_extents); return error; } @@ -761,7 +753,7 @@ xrep_agi_init_header( struct xfs_buf *agi_bp, struct xfs_agi *old_agi) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; memcpy(old_agi, agi, sizeof(*old_agi)); @@ -807,7 +799,7 @@ xrep_agi_calc_from_btrees( struct xfs_buf *agi_bp) { struct xfs_btree_cur *cur; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; struct xfs_mount *mp = sc->mp; xfs_agino_t count; xfs_agino_t freecount; @@ -835,7 +827,7 @@ xrep_agi_commit_new( struct xfs_buf *agi_bp) { struct xfs_perag *pag; - struct xfs_agi *agi = XFS_BUF_TO_AGI(agi_bp); + struct xfs_agi *agi = agi_bp->b_addr; /* Trigger inode count recalculation */ xfs_force_summary_recalc(sc->mp); @@ -892,7 +884,7 @@ xrep_agi( if (error) return error; agi_bp->b_ops = &xfs_agi_buf_ops; - agi = XFS_BUF_TO_AGI(agi_bp); + agi = agi_bp->b_addr; /* Find the AGI btree roots. */ error = xrep_agi_find_btrees(sc, fab); diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 5533e48e605d..73d924e47565 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -94,7 +94,7 @@ xchk_allocbt_rec( union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; xfs_extlen_t len; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index d9f0dd444b80..9faddb334a2c 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -98,7 +98,7 @@ struct xchk_xattr { /* * Check that an extended attribute key can be looked up by hash. * - * We use the XFS attribute list iterator (i.e. xfs_attr_list_int_ilocked) + * We use the XFS attribute list iterator (i.e. xfs_attr_list_ilocked) * to call this function for every attribute key in an inode. Once * we're here, we load the attribute value to see if any errors happen, * or if we get more or less data than we expected. @@ -147,11 +147,8 @@ xchk_xattr_listent( return; } - args.flags = ATTR_KERNOTIME; - if (flags & XFS_ATTR_ROOT) - args.flags |= ATTR_ROOT; - else if (flags & XFS_ATTR_SECURE) - args.flags |= ATTR_SECURE; + args.op_flags = XFS_DA_OP_NOTIME; + args.attr_filter = flags & XFS_ATTR_NSP_ONDISK_MASK; args.geo = context->dp->i_mount->m_attr_geo; args.whichfork = XFS_ATTR_FORK; args.dp = context->dp; @@ -162,7 +159,10 @@ xchk_xattr_listent( args.value = xchk_xattr_valuebuf(sx->sc); args.valuelen = valuelen; - error = xfs_attr_get_ilocked(context->dp, &args); + error = xfs_attr_get_ilocked(&args); + /* ENODATA means the hash lookup failed and the attr is bad */ + if (error == -ENODATA) + error = -EFSCORRUPTED; if (!xchk_fblock_process_error(sx->sc, XFS_ATTR_FORK, args.blkno, &error)) goto fail_xref; @@ -474,7 +474,6 @@ xchk_xattr( struct xfs_scrub *sc) { struct xchk_xattr sx; - struct attrlist_cursor_kern cursor = { 0 }; xfs_dablk_t last_checked = -1U; int error = 0; @@ -493,11 +492,10 @@ xchk_xattr( /* Check that every attr key can also be looked up by hash. */ sx.context.dp = sc->ip; - sx.context.cursor = &cursor; sx.context.resynch = 1; sx.context.put_listent = xchk_xattr_listent; sx.context.tp = sc->tp; - sx.context.flags = ATTR_INCOMPLETE; + sx.context.allow_incomplete = true; sx.sc = sc; /* @@ -516,7 +514,7 @@ xchk_xattr( * iteration, which doesn't really follow the usual buffer * locking order. */ - error = xfs_attr_list_int_ilocked(&sx.context); + error = xfs_attr_list_ilocked(&sx.context); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) goto out; diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 18a684e18a69..f88694f22d05 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -18,14 +18,14 @@ * This is the logical equivalent of bitmap |= mask(start, len). */ int -xfs_bitmap_set( - struct xfs_bitmap *bitmap, +xbitmap_set( + struct xbitmap *bitmap, uint64_t start, uint64_t len) { - struct xfs_bitmap_range *bmr; + struct xbitmap_range *bmr; - bmr = kmem_alloc(sizeof(struct xfs_bitmap_range), KM_MAYFAIL); + bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); if (!bmr) return -ENOMEM; @@ -39,13 +39,13 @@ xfs_bitmap_set( /* Free everything related to this bitmap. */ void -xfs_bitmap_destroy( - struct xfs_bitmap *bitmap) +xbitmap_destroy( + struct xbitmap *bitmap) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; - for_each_xfs_bitmap_extent(bmr, n, bitmap) { + for_each_xbitmap_extent(bmr, n, bitmap) { list_del(&bmr->list); kmem_free(bmr); } @@ -53,24 +53,24 @@ xfs_bitmap_destroy( /* Set up a per-AG block bitmap. */ void -xfs_bitmap_init( - struct xfs_bitmap *bitmap) +xbitmap_init( + struct xbitmap *bitmap) { INIT_LIST_HEAD(&bitmap->list); } /* Compare two btree extents. */ static int -xfs_bitmap_range_cmp( +xbitmap_range_cmp( void *priv, struct list_head *a, struct list_head *b) { - struct xfs_bitmap_range *ap; - struct xfs_bitmap_range *bp; + struct xbitmap_range *ap; + struct xbitmap_range *bp; - ap = container_of(a, struct xfs_bitmap_range, list); - bp = container_of(b, struct xfs_bitmap_range, list); + ap = container_of(a, struct xbitmap_range, list); + bp = container_of(b, struct xbitmap_range, list); if (ap->start > bp->start) return 1; @@ -96,14 +96,14 @@ xfs_bitmap_range_cmp( #define LEFT_ALIGNED (1 << 0) #define RIGHT_ALIGNED (1 << 1) int -xfs_bitmap_disunion( - struct xfs_bitmap *bitmap, - struct xfs_bitmap *sub) +xbitmap_disunion( + struct xbitmap *bitmap, + struct xbitmap *sub) { struct list_head *lp; - struct xfs_bitmap_range *br; - struct xfs_bitmap_range *new_br; - struct xfs_bitmap_range *sub_br; + struct xbitmap_range *br; + struct xbitmap_range *new_br; + struct xbitmap_range *sub_br; uint64_t sub_start; uint64_t sub_len; int state; @@ -113,8 +113,8 @@ xfs_bitmap_disunion( return 0; ASSERT(!list_empty(&sub->list)); - list_sort(NULL, &bitmap->list, xfs_bitmap_range_cmp); - list_sort(NULL, &sub->list, xfs_bitmap_range_cmp); + list_sort(NULL, &bitmap->list, xbitmap_range_cmp); + list_sort(NULL, &sub->list, xbitmap_range_cmp); /* * Now that we've sorted both lists, we iterate bitmap once, rolling @@ -124,11 +124,11 @@ xfs_bitmap_disunion( * list traversal is similar to merge sort, but we're deleting * instead. In this manner we avoid O(n^2) operations. */ - sub_br = list_first_entry(&sub->list, struct xfs_bitmap_range, + sub_br = list_first_entry(&sub->list, struct xbitmap_range, list); lp = bitmap->list.next; while (lp != &bitmap->list) { - br = list_entry(lp, struct xfs_bitmap_range, list); + br = list_entry(lp, struct xbitmap_range, list); /* * Advance sub_br and/or br until we find a pair that @@ -181,7 +181,7 @@ xfs_bitmap_disunion( * Deleting from the middle: add the new right extent * and then shrink the left extent. */ - new_br = kmem_alloc(sizeof(struct xfs_bitmap_range), + new_br = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL); if (!new_br) { error = -ENOMEM; @@ -247,8 +247,8 @@ out: * blocks going from the leaf towards the root. */ int -xfs_bitmap_set_btcur_path( - struct xfs_bitmap *bitmap, +xbitmap_set_btcur_path( + struct xbitmap *bitmap, struct xfs_btree_cur *cur) { struct xfs_buf *bp; @@ -261,7 +261,7 @@ xfs_bitmap_set_btcur_path( if (!bp) continue; fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); - error = xfs_bitmap_set(bitmap, fsb, 1); + error = xbitmap_set(bitmap, fsb, 1); if (error) return error; } @@ -271,12 +271,12 @@ xfs_bitmap_set_btcur_path( /* Collect a btree's block in the bitmap. */ STATIC int -xfs_bitmap_collect_btblock( +xbitmap_collect_btblock( struct xfs_btree_cur *cur, int level, void *priv) { - struct xfs_bitmap *bitmap = priv; + struct xbitmap *bitmap = priv; struct xfs_buf *bp; xfs_fsblock_t fsbno; @@ -285,15 +285,30 @@ xfs_bitmap_collect_btblock( return 0; fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); - return xfs_bitmap_set(bitmap, fsbno, 1); + return xbitmap_set(bitmap, fsbno, 1); } /* Walk the btree and mark the bitmap wherever a btree block is found. */ int -xfs_bitmap_set_btblocks( - struct xfs_bitmap *bitmap, +xbitmap_set_btblocks( + struct xbitmap *bitmap, struct xfs_btree_cur *cur) { - return xfs_btree_visit_blocks(cur, xfs_bitmap_collect_btblock, + return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock, XFS_BTREE_VISIT_ALL, bitmap); } + +/* How many bits are set in this bitmap? */ +uint64_t +xbitmap_hweight( + struct xbitmap *bitmap) +{ + struct xbitmap_range *bmr; + struct xbitmap_range *n; + uint64_t ret = 0; + + for_each_xbitmap_extent(bmr, n, bitmap) + ret += bmr->len; + + return ret; +} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index ae8ecbce6fa6..900646b72de1 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -6,31 +6,32 @@ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xfs_bitmap_range { +struct xbitmap_range { struct list_head list; uint64_t start; uint64_t len; }; -struct xfs_bitmap { +struct xbitmap { struct list_head list; }; -void xfs_bitmap_init(struct xfs_bitmap *bitmap); -void xfs_bitmap_destroy(struct xfs_bitmap *bitmap); +void xbitmap_init(struct xbitmap *bitmap); +void xbitmap_destroy(struct xbitmap *bitmap); -#define for_each_xfs_bitmap_extent(bex, n, bitmap) \ +#define for_each_xbitmap_extent(bex, n, bitmap) \ list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) -#define for_each_xfs_bitmap_block(b, bex, n, bitmap) \ +#define for_each_xbitmap_block(b, bex, n, bitmap) \ list_for_each_entry_safe((bex), (n), &(bitmap)->list, list) \ - for ((b) = bex->start; (b) < bex->start + bex->len; (b)++) + for ((b) = (bex)->start; (b) < (bex)->start + (bex)->len; (b)++) -int xfs_bitmap_set(struct xfs_bitmap *bitmap, uint64_t start, uint64_t len); -int xfs_bitmap_disunion(struct xfs_bitmap *bitmap, struct xfs_bitmap *sub); -int xfs_bitmap_set_btcur_path(struct xfs_bitmap *bitmap, +int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); +int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); +int xbitmap_set_btcur_path(struct xbitmap *bitmap, struct xfs_btree_cur *cur); -int xfs_bitmap_set_btblocks(struct xfs_bitmap *bitmap, +int xbitmap_set_btblocks(struct xbitmap *bitmap, struct xfs_btree_cur *cur); +uint64_t xbitmap_hweight(struct xbitmap *bitmap); #endif /* __XFS_SCRUB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index fa6ea6407992..add8598eacd5 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -374,7 +374,7 @@ xchk_bmapbt_rec( struct xfs_bmbt_irec iext_irec; struct xfs_iext_cursor icur; struct xchk_bmap_info *info = bs->private; - struct xfs_inode *ip = bs->cur->bc_private.b.ip; + struct xfs_inode *ip = bs->cur->bc_ino.ip; struct xfs_buf *bp = NULL; struct xfs_btree_block *block; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); @@ -501,7 +501,7 @@ xchk_bmap_check_rmap( xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, - cur->bc_private.a.agno, rec->rm_startblock)) + cur->bc_ag.agno, rec->rm_startblock)) xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); if (irec.br_blockcount > rec->rm_blockcount) diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 97a15b6f2865..9a2e27ac1300 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -219,19 +219,21 @@ xchk_da_btree_block_check_sibling( int direction, xfs_dablk_t sibling) { + struct xfs_da_state_path *path = &ds->state->path; + struct xfs_da_state_path *altpath = &ds->state->altpath; int retval; + int plevel; int error; - memcpy(&ds->state->altpath, &ds->state->path, - sizeof(ds->state->altpath)); + memcpy(altpath, path, sizeof(ds->state->altpath)); /* * If the pointer is null, we shouldn't be able to move the upper * level pointer anywhere. */ if (sibling == 0) { - error = xfs_da3_path_shift(ds->state, &ds->state->altpath, - direction, false, &retval); + error = xfs_da3_path_shift(ds->state, altpath, direction, + false, &retval); if (error == 0 && retval == 0) xchk_da_set_corrupt(ds, level); error = 0; @@ -239,27 +241,33 @@ xchk_da_btree_block_check_sibling( } /* Move the alternate cursor one block in the direction given. */ - error = xfs_da3_path_shift(ds->state, &ds->state->altpath, - direction, false, &retval); + error = xfs_da3_path_shift(ds->state, altpath, direction, false, + &retval); if (!xchk_da_process_error(ds, level, &error)) - return error; + goto out; if (retval) { xchk_da_set_corrupt(ds, level); - return error; + goto out; } - if (ds->state->altpath.blk[level].bp) - xchk_buffer_recheck(ds->sc, - ds->state->altpath.blk[level].bp); + if (altpath->blk[level].bp) + xchk_buffer_recheck(ds->sc, altpath->blk[level].bp); /* Compare upper level pointer to sibling pointer. */ - if (ds->state->altpath.blk[level].blkno != sibling) + if (altpath->blk[level].blkno != sibling) xchk_da_set_corrupt(ds, level); - if (ds->state->altpath.blk[level].bp) { - xfs_trans_brelse(ds->dargs.trans, - ds->state->altpath.blk[level].bp); - ds->state->altpath.blk[level].bp = NULL; - } + out: + /* Free all buffers in the altpath that aren't referenced from path. */ + for (plevel = 0; plevel < altpath->active; plevel++) { + if (altpath->blk[plevel].bp == NULL || + (plevel < path->active && + altpath->blk[plevel].bp == path->blk[plevel].bp)) + continue; + + xfs_trans_brelse(ds->dargs.trans, altpath->blk[plevel].bp); + altpath->blk[plevel].bp = NULL; + } + return error; } diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 266da4e4bde6..fe2a6e030c8a 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -155,6 +155,9 @@ xchk_dir_actor( xname.type = XFS_DIR3_FT_UNKNOWN; error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); + /* ENOENT means the hash lookup failed and the dir is corrupt */ + if (error == -ENOENT) + error = -EFSCORRUPTED; if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, &error)) goto out; @@ -500,7 +503,7 @@ xchk_directory_leaf1_bestfree( /* Read the free space block. */ error = xfs_dir3_leaf_read(sc->tp, sc->ip, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - goto out; + return error; xchk_buffer_recheck(sc, bp); leaf = bp->b_addr; @@ -565,9 +568,10 @@ xchk_directory_leaf1_bestfree( xchk_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; + break; } out: + xfs_trans_brelse(sc->tp, bp); return error; } @@ -589,7 +593,7 @@ xchk_directory_free_bestfree( /* Read the free space block */ error = xfs_dir2_free_read(sc->tp, sc->ip, lblk, &bp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - goto out; + return error; xchk_buffer_recheck(sc, bp); if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { @@ -612,7 +616,7 @@ xchk_directory_free_bestfree( 0, &dbp); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - break; + goto out; xchk_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); } @@ -620,6 +624,7 @@ xchk_directory_free_bestfree( if (freehdr.nused + stale != freehdr.nvalid) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); out: + xfs_trans_brelse(sc->tp, bp); return error; } diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 681758704fda..64c217eb06a7 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -104,7 +104,7 @@ xchk_iallocbt_chunk( xfs_extlen_t len) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); @@ -164,7 +164,7 @@ xchk_iallocbt_check_cluster_ifree( * the record, compute which fs inode we're talking about. */ agino = irec->ir_startino + irec_ino; - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_ag.agno, agino); irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || @@ -215,7 +215,7 @@ xchk_iallocbt_check_cluster( struct xfs_dinode *dip; struct xfs_buf *cluster_bp; unsigned int nr_inodes; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t agbno; unsigned int cluster_index; uint16_t cluster_mask = 0; @@ -426,7 +426,7 @@ xchk_iallocbt_rec( struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agino_t agino; xfs_extlen_t len; int holecount; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 0cab11a5d390..beaeb6fa3119 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -336,7 +336,7 @@ xchk_refcountbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; xfs_agblock_t *cow_blocks = bs->private; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; xfs_agblock_t bno; xfs_extlen_t len; xfs_nlink_t refcount; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index e489d7a8446a..db3cfd12803d 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -208,8 +208,10 @@ xrep_calc_ag_resblks( /* Now grab the block counters from the AGF. */ error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp); if (!error) { - aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length); - freelen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_freeblks); + struct xfs_agf *agf = bp->b_addr; + + aglen = be32_to_cpu(agf->agf_length); + freelen = be32_to_cpu(agf->agf_freeblks); usedlen = aglen - freelen; xfs_buf_relse(bp); } @@ -434,10 +436,10 @@ xrep_init_btblock( int xrep_invalidate_blocks( struct xfs_scrub *sc, - struct xfs_bitmap *bitmap) + struct xbitmap *bitmap) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; struct xfs_buf *bp; xfs_fsblock_t fsbno; @@ -449,7 +451,7 @@ xrep_invalidate_blocks( * because we never own those; and if we can't TRYLOCK the buffer we * assume it's owned by someone else. */ - for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) { + for_each_xbitmap_block(fsbno, bmr, n, bitmap) { /* Skip AG headers and post-EOFS blocks */ if (!xfs_verify_fsbno(sc->mp, fsbno)) continue; @@ -595,18 +597,18 @@ out_free: int xrep_reap_extents( struct xfs_scrub *sc, - struct xfs_bitmap *bitmap, + struct xbitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type) { - struct xfs_bitmap_range *bmr; - struct xfs_bitmap_range *n; + struct xbitmap_range *bmr; + struct xbitmap_range *n; xfs_fsblock_t fsbno; int error = 0; ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb)); - for_each_xfs_bitmap_block(fsbno, bmr, n, bitmap) { + for_each_xbitmap_block(fsbno, bmr, n, bitmap) { ASSERT(sc->ip != NULL || XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.agno); trace_xrep_dispose_btree_extent(sc->mp, @@ -615,11 +617,9 @@ xrep_reap_extents( error = xrep_reap_block(sc, fsbno, oinfo, type); if (error) - goto out; + break; } -out: - xfs_bitmap_destroy(bitmap); return error; } @@ -879,7 +879,7 @@ xrep_find_ag_btree_roots( ri.sc = sc; ri.btree_info = btree_info; - ri.agf = XFS_BUF_TO_AGF(agf_bp); + ri.agf = agf_bp->b_addr; ri.agfl_bp = agfl_bp; for (fab = btree_info; fab->buf_ops; fab++) { ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG); diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index c3422403b169..04a47d45605b 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -28,11 +28,11 @@ int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb, struct xfs_buf **bpp, xfs_btnum_t btnum, const struct xfs_buf_ops *ops); -struct xfs_bitmap; +struct xbitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); -int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xfs_bitmap *btlist); -int xrep_reap_extents(struct xfs_scrub *sc, struct xfs_bitmap *exlist, +int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist); +int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); struct xrep_find_ag_btree { diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 8d4cefd761c1..f4fcb4719f41 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -92,7 +92,7 @@ xchk_rmapbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_rmap_irec irec; - xfs_agnumber_t agno = bs->cur->bc_private.a.agno; + xfs_agnumber_t agno = bs->cur->bc_ag.agno; bool non_inode; bool is_unwritten; bool is_bmbt; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index f1775bb19313..8ebf35b115ce 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -168,6 +168,7 @@ xchk_teardown( xfs_irele(sc->ip); sc->ip = NULL; } + sb_end_write(sc->mp->m_super); if (sc->flags & XCHK_REAPING_DISABLED) xchk_start_reaping(sc); if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { @@ -490,6 +491,14 @@ xfs_scrub_metadata( sc.ops = &meta_scrub_ops[sm->sm_type]; sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); retry_op: + /* + * If freeze runs concurrently with a scrub, the freeze can be delayed + * indefinitely as we walk the filesystem and iterate over metadata + * buffers. Freeze quiesces the log (which waits for the buffer LRU to + * be emptied) and that won't happen while checking is running. + */ + sb_start_write(mp->m_super); + /* Set up for the operation. */ error = sc.ops->setup(&sc, ip); if (error) diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 9eaab2eb5ed3..2c6c248be823 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -24,9 +24,9 @@ xchk_btree_cur_fsbno( return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn); else if (level == cur->bc_nlevels - 1 && cur->bc_flags & XFS_BTREE_LONG_PTRS) - return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_private.b.ip->i_ino); + return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) - return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, 0); + return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.agno, 0); return NULLFSBLOCK; } diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 096203119934..e46f5cef90da 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -379,7 +379,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->ino = sc->ip->i_ino; - __entry->whichfork = cur->bc_private.b.whichfork; + __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __entry->btnum = cur->bc_btnum; __entry->level = level; @@ -459,7 +459,7 @@ TRACE_EVENT(xchk_ifork_btree_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->ino = sc->ip->i_ino; - __entry->whichfork = cur->bc_private.b.whichfork; + __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; __entry->btnum = cur->bc_btnum; __entry->level = level; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index cd743fad8478..d4c687b5cd06 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -14,6 +14,8 @@ #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_acl.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include <linux/posix_acl_xattr.h> @@ -67,10 +69,12 @@ xfs_acl_from_disk( switch (acl_e->e_tag) { case ACL_USER: - acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id)); + acl_e->e_uid = make_kuid(&init_user_ns, + be32_to_cpu(ace->ae_id)); break; case ACL_GROUP: - acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id)); + acl_e->e_gid = make_kgid(&init_user_ns, + be32_to_cpu(ace->ae_id)); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: @@ -103,10 +107,12 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) ace->ae_tag = cpu_to_be32(acl_e->e_tag); switch (acl_e->e_tag) { case ACL_USER: - ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); + ace->ae_id = cpu_to_be32( + from_kuid(&init_user_ns, acl_e->e_uid)); break; case ACL_GROUP: - ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid)); + ace->ae_id = cpu_to_be32( + from_kgid(&init_user_ns, acl_e->e_gid)); break; default: ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID); @@ -120,102 +126,86 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) struct posix_acl * xfs_get_acl(struct inode *inode, int type) { - struct xfs_inode *ip = XFS_I(inode); - struct posix_acl *acl = NULL; - struct xfs_acl *xfs_acl = NULL; - unsigned char *ea_name; - int error; - int len; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct posix_acl *acl = NULL; + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_ROOT, + .valuelen = XFS_ACL_MAX_SIZE(mp), + }; + int error; trace_xfs_get_acl(ip); switch (type) { case ACL_TYPE_ACCESS: - ea_name = SGI_ACL_FILE; + args.name = SGI_ACL_FILE; break; case ACL_TYPE_DEFAULT: - ea_name = SGI_ACL_DEFAULT; + args.name = SGI_ACL_DEFAULT; break; default: BUG(); } + args.namelen = strlen(args.name); /* - * If we have a cached ACLs value just return it, not need to - * go out to the disk. + * If the attribute doesn't exist make sure we have a negative cache + * entry, for any other error assume it is transient. */ - len = XFS_ACL_MAX_SIZE(ip->i_mount); - error = xfs_attr_get(ip, ea_name, strlen(ea_name), - (unsigned char **)&xfs_acl, &len, - ATTR_ALLOC | ATTR_ROOT); - if (error) { - /* - * If the attribute doesn't exist make sure we have a negative - * cache entry, for any other error assume it is transient. - */ - if (error != -ENOATTR) - acl = ERR_PTR(error); - } else { - acl = xfs_acl_from_disk(ip->i_mount, xfs_acl, len, - XFS_ACL_MAX_ENTRIES(ip->i_mount)); - kmem_free(xfs_acl); + error = xfs_attr_get(&args); + if (!error) { + acl = xfs_acl_from_disk(mp, args.value, args.valuelen, + XFS_ACL_MAX_ENTRIES(mp)); + } else if (error != -ENOATTR) { + acl = ERR_PTR(error); } + + kmem_free(args.value); return acl; } int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct xfs_inode *ip = XFS_I(inode); - unsigned char *ea_name; - int error; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_ROOT, + }; + int error; switch (type) { case ACL_TYPE_ACCESS: - ea_name = SGI_ACL_FILE; + args.name = SGI_ACL_FILE; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; - ea_name = SGI_ACL_DEFAULT; + args.name = SGI_ACL_DEFAULT; break; default: return -EINVAL; } + args.namelen = strlen(args.name); if (acl) { - struct xfs_acl *xfs_acl; - int len = XFS_ACL_MAX_SIZE(ip->i_mount); - - xfs_acl = kmem_zalloc_large(len, 0); - if (!xfs_acl) + args.valuelen = XFS_ACL_SIZE(acl->a_count); + args.value = kmem_zalloc_large(args.valuelen, 0); + if (!args.value) return -ENOMEM; - - xfs_acl_to_disk(xfs_acl, acl); - - /* subtract away the unused acl entries */ - len -= sizeof(struct xfs_acl_entry) * - (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); - - error = xfs_attr_set(ip, ea_name, strlen(ea_name), - (unsigned char *)xfs_acl, len, ATTR_ROOT); - - kmem_free(xfs_acl); - } else { - /* - * A NULL ACL argument means we want to remove the ACL. - */ - error = xfs_attr_remove(ip, ea_name, - strlen(ea_name), - ATTR_ROOT); - - /* - * If the attribute didn't exist to start with that's fine. - */ - if (error == -ENOATTR) - error = 0; + xfs_acl_to_disk(args.value, acl); } + error = xfs_attr_set(&args); + kmem_free(args.value); + + /* + * If the attribute didn't exist to start with that's fine. + */ + if (!acl && error == -ENOATTR) + error = 0; if (!error) set_cached_acl(inode, type, acl); return error; @@ -275,3 +265,19 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; } + +/* + * Invalidate any cached ACLs if the user has bypassed the ACL interface. + * We don't validate the content whatsoever so it is caller responsibility to + * provide data in valid format and ensure i_mode is consistent. + */ +void +xfs_forget_acl( + struct inode *inode, + const char *name) +{ + if (!strcmp(name, SGI_ACL_FILE)) + forget_cached_acl(inode, ACL_TYPE_ACCESS); + else if (!strcmp(name, SGI_ACL_DEFAULT)) + forget_cached_acl(inode, ACL_TYPE_DEFAULT); +} diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 94615e34bc86..c042c0868016 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -13,14 +13,16 @@ struct posix_acl; extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +void xfs_forget_acl(struct inode *inode, const char *name); #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } # define xfs_set_acl NULL +static inline void xfs_forget_acl(struct inode *inode, const char *name) +{ +} #endif /* CONFIG_XFS_POSIX_ACL */ -extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags); - #endif /* __XFS_ACL_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 58e937be24ce..9d9cebf18726 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -539,7 +539,7 @@ xfs_discard_page( if (XFS_FORCED_SHUTDOWN(mp)) goto out_invalidate; - xfs_alert(mp, + xfs_alert_ratelimited(mp, "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", page, ip->i_ino, offset); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index bbfa6ba84dcd..c42f90e16b4f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -145,8 +145,8 @@ xfs_attr3_node_inactive( * Since this code is recursive (gasp!) we must protect ourselves. */ if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ - xfs_buf_corruption_error(bp); return -EFSCORRUPTED; } @@ -194,7 +194,7 @@ xfs_attr3_node_inactive( error = xfs_attr3_leaf_inactive(trans, dp, child_bp); break; default: - xfs_buf_corruption_error(child_bp); + xfs_buf_mark_corrupt(child_bp); xfs_trans_brelse(*trans, child_bp); error = -EFSCORRUPTED; break; @@ -289,7 +289,7 @@ xfs_attr3_root_inactive( break; default: error = -EFSCORRUPTED; - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); break; } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index d37743bdf274..5ff1d929d3b5 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -52,24 +52,19 @@ static int xfs_attr_shortform_list( struct xfs_attr_list_context *context) { - struct attrlist_cursor_kern *cursor; + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; + struct xfs_inode *dp = context->dp; struct xfs_attr_sf_sort *sbuf, *sbp; struct xfs_attr_shortform *sf; struct xfs_attr_sf_entry *sfe; - struct xfs_inode *dp; int sbsize, nsbuf, count, i; int error = 0; - ASSERT(context != NULL); - dp = context->dp; - ASSERT(dp != NULL); ASSERT(dp->i_afp != NULL); sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; - cursor = context->cursor; - ASSERT(cursor != NULL); trace_xfs_attr_list_sf(context); @@ -205,7 +200,7 @@ out: STATIC int xfs_attr_node_list_lookup( struct xfs_attr_list_context *context, - struct attrlist_cursor_kern *cursor, + struct xfs_attrlist_cursor_kern *cursor, struct xfs_buf **pbp) { struct xfs_da3_icnode_hdr nodehdr; @@ -279,7 +274,7 @@ xfs_attr_node_list_lookup( return 0; out_corruptbuf: - xfs_buf_corruption_error(bp); + xfs_buf_mark_corrupt(bp); xfs_trans_brelse(tp, bp); return -EFSCORRUPTED; } @@ -288,8 +283,8 @@ STATIC int xfs_attr_node_list( struct xfs_attr_list_context *context) { + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_attr3_icleaf_hdr leafhdr; - struct attrlist_cursor_kern *cursor; struct xfs_attr_leafblock *leaf; struct xfs_da_intnode *node; struct xfs_buf *bp; @@ -299,7 +294,6 @@ xfs_attr_node_list( trace_xfs_attr_node_list(context); - cursor = context->cursor; cursor->initted = 1; /* @@ -394,7 +388,7 @@ xfs_attr3_leaf_list_int( struct xfs_buf *bp, struct xfs_attr_list_context *context) { - struct attrlist_cursor_kern *cursor; + struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_attr_leafblock *leaf; struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; @@ -408,7 +402,6 @@ xfs_attr3_leaf_list_int( xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); - cursor = context->cursor; cursor->initted = 1; /* @@ -452,8 +445,8 @@ xfs_attr3_leaf_list_int( } if ((entry->flags & XFS_ATTR_INCOMPLETE) && - !(context->flags & ATTR_INCOMPLETE)) - continue; /* skip incomplete entries */ + !context->allow_incomplete) + continue; if (entry->flags & XFS_ATTR_LOCAL) { xfs_attr_leaf_name_local_t *name_loc; @@ -488,14 +481,15 @@ xfs_attr3_leaf_list_int( * Copy out attribute entries for attr_list(), for leaf attribute lists. */ STATIC int -xfs_attr_leaf_list(xfs_attr_list_context_t *context) +xfs_attr_leaf_list( + struct xfs_attr_list_context *context) { - int error; - struct xfs_buf *bp; + struct xfs_buf *bp; + int error; trace_xfs_attr_leaf_list(context); - context->cursor->blkno = 0; + context->cursor.blkno = 0; error = xfs_attr3_leaf_read(context->tp, context->dp, 0, &bp); if (error) return error; @@ -506,7 +500,7 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) } int -xfs_attr_list_int_ilocked( +xfs_attr_list_ilocked( struct xfs_attr_list_context *context) { struct xfs_inode *dp = context->dp; @@ -526,12 +520,12 @@ xfs_attr_list_int_ilocked( } int -xfs_attr_list_int( - xfs_attr_list_context_t *context) +xfs_attr_list( + struct xfs_attr_list_context *context) { - int error; - xfs_inode_t *dp = context->dp; - uint lock_mode; + struct xfs_inode *dp = context->dp; + uint lock_mode; + int error; XFS_STATS_INC(dp->i_mount, xs_attr_list); @@ -539,130 +533,7 @@ xfs_attr_list_int( return -EIO; lock_mode = xfs_ilock_attr_map_shared(dp); - error = xfs_attr_list_int_ilocked(context); + error = xfs_attr_list_ilocked(context); xfs_iunlock(dp, lock_mode); return error; } - -#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ - (((struct attrlist_ent *) 0)->a_name - (char *) 0) -#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ - ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(uint32_t)-1) \ - & ~(sizeof(uint32_t)-1)) - -/* - * Format an attribute and copy it out to the user's buffer. - * Take care to check values and protect against them changing later, - * we may be reading them directly out of a user buffer. - */ -STATIC void -xfs_attr_put_listent( - xfs_attr_list_context_t *context, - int flags, - unsigned char *name, - int namelen, - int valuelen) -{ - struct attrlist *alist = (struct attrlist *)context->alist; - attrlist_ent_t *aep; - int arraytop; - - ASSERT(!context->seen_enough); - ASSERT(!(context->flags & ATTR_KERNOVAL)); - ASSERT(context->count >= 0); - ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*alist)); - ASSERT(context->firstu <= context->bufsize); - - /* - * Only list entries in the right namespace. - */ - if (((context->flags & ATTR_SECURE) == 0) != - ((flags & XFS_ATTR_SECURE) == 0)) - return; - if (((context->flags & ATTR_ROOT) == 0) != - ((flags & XFS_ATTR_ROOT) == 0)) - return; - - arraytop = sizeof(*alist) + - context->count * sizeof(alist->al_offset[0]); - context->firstu -= ATTR_ENTSIZE(namelen); - if (context->firstu < arraytop) { - trace_xfs_attr_list_full(context); - alist->al_more = 1; - context->seen_enough = 1; - return; - } - - aep = (attrlist_ent_t *)&context->alist[context->firstu]; - aep->a_valuelen = valuelen; - memcpy(aep->a_name, name, namelen); - aep->a_name[namelen] = 0; - alist->al_offset[context->count++] = context->firstu; - alist->al_count = context->count; - trace_xfs_attr_list_add(context); - return; -} - -/* - * Generate a list of extended attribute names and optionally - * also value lengths. Positive return value follows the XFS - * convention of being an error, zero or negative return code - * is the length of the buffer returned (negated), indicating - * success. - */ -int -xfs_attr_list( - xfs_inode_t *dp, - char *buffer, - int bufsize, - int flags, - attrlist_cursor_kern_t *cursor) -{ - xfs_attr_list_context_t context; - struct attrlist *alist; - int error; - - /* - * Validate the cursor. - */ - if (cursor->pad1 || cursor->pad2) - return -EINVAL; - if ((cursor->initted == 0) && - (cursor->hashval || cursor->blkno || cursor->offset)) - return -EINVAL; - - /* Only internal consumers can retrieve incomplete attrs. */ - if (flags & ATTR_INCOMPLETE) - return -EINVAL; - - /* - * Check for a properly aligned buffer. - */ - if (((long)buffer) & (sizeof(int)-1)) - return -EFAULT; - if (flags & ATTR_KERNOVAL) - bufsize = 0; - - /* - * Initialize the output buffer. - */ - memset(&context, 0, sizeof(context)); - context.dp = dp; - context.cursor = cursor; - context.resynch = 1; - context.flags = flags; - context.alist = buffer; - context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ - context.firstu = context.bufsize; - context.put_listent = xfs_attr_put_listent; - - alist = (struct attrlist *)context.alist; - alist->al_count = 0; - alist->al_more = 0; - alist->al_offset[0] = context.bufsize; - - error = xfs_attr_list_int(&context); - ASSERT(error <= 0); - return error; -} diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e62fb5216341..4f800f7fe888 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1062,7 +1062,6 @@ xfs_collapse_file_space( int error; xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len); xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); - uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); bool done = false; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); @@ -1078,32 +1077,34 @@ xfs_collapse_file_space( if (error) return error; - while (!error && !done) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, - &tp); - if (error) - break; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + return error; - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, - ip->i_gdquot, ip->i_pdquot, resblks, 0, - XFS_QMOPT_RES_REGBLKS); - if (error) - goto out_trans_cancel; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + while (!done) { error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb, &done); if (error) goto out_trans_cancel; + if (done) + break; - error = xfs_trans_commit(tp); + /* finish any deferred frees and roll the transaction */ + error = xfs_defer_finish(&tp); + if (error) + goto out_trans_cancel; } + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1146,35 +1147,41 @@ xfs_insert_file_space( if (error) return error; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + /* * The extent shifting code works on extent granularity. So, if stop_fsb * is not the starting block of extent, we need to split the extent at * stop_fsb. */ - error = xfs_bmap_split_extent(ip, stop_fsb); + error = xfs_bmap_split_extent(tp, ip, stop_fsb); if (error) - return error; + goto out_trans_cancel; - while (!error && !done) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, - &tp); + do { + error = xfs_trans_roll_inode(&tp, ip); if (error) - break; + goto out_trans_cancel; - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb, &done, stop_fsb); if (error) goto out_trans_cancel; + } while (!done); - error = xfs_trans_commit(tp); - } - + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1442,12 +1449,12 @@ xfs_swap_extent_forks( * event of a crash. Set the owner change log flags now and leave the * bmbt scan as the last step. */ - if (ip->i_d.di_version == 3 && - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) - (*target_log_flags) |= XFS_ILOG_DOWNER; - if (tip->i_d.di_version == 3 && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) - (*src_log_flags) |= XFS_ILOG_DOWNER; + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*target_log_flags) |= XFS_ILOG_DOWNER; + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + (*src_log_flags) |= XFS_ILOG_DOWNER; + } /* * Swap the data forks of the inodes @@ -1482,7 +1489,7 @@ xfs_swap_extent_forks( (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - ASSERT(ip->i_d.di_version < 3 || + ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || (*src_log_flags & XFS_ILOG_DOWNER)); (*src_log_flags) |= XFS_ILOG_DBROOT; break; @@ -1494,7 +1501,7 @@ xfs_swap_extent_forks( break; case XFS_DINODE_FMT_BTREE: (*target_log_flags) |= XFS_ILOG_DBROOT; - ASSERT(tip->i_d.di_version < 3 || + ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || (*target_log_flags & XFS_ILOG_DOWNER)); break; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 217e4f82a44a..9ec3eaf1c618 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -327,6 +327,9 @@ xfs_buf_free( __free_page(page); } + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += + bp->b_page_count; } else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); @@ -727,8 +730,9 @@ found: if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { - xfs_warn(target->bt_mount, - "%s: failed to map pagesn", __func__); + xfs_warn_ratelimited(target->bt_mount, + "%s: failed to map %u pages", __func__, + bp->b_page_count); xfs_buf_relse(bp); return error; } @@ -1238,7 +1242,7 @@ xfs_buf_ioerror_alert( struct xfs_buf *bp, xfs_failaddr_t func) { - xfs_alert(bp->b_mount, + xfs_alert_ratelimited(bp->b_mount, "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, -bp->b_error); @@ -1573,6 +1577,28 @@ xfs_buf_zero( } /* + * Log a message about and stale a buffer that a caller has decided is corrupt. + * + * This function should be called for the kinds of metadata corruption that + * cannot be detect from a verifier, such as incorrect inter-block relationship + * data. Do /not/ call this function from a verifier function. + * + * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will + * be marked stale, but b_error will not be set. The caller is responsible for + * releasing the buffer or fixing it. + */ +void +__xfs_buf_mark_corrupt( + struct xfs_buf *bp, + xfs_failaddr_t fa) +{ + ASSERT(bp->b_flags & XBF_DONE); + + xfs_buf_corruption_error(bp, fa); + xfs_buf_stale(bp); +} + +/* * Handling of buffer targets (buftargs). */ @@ -2091,9 +2117,11 @@ xfs_buf_delwri_pushbuf( int __init xfs_buf_init(void) { - xfs_buf_zone = kmem_cache_create("xfs_buf", - sizeof(struct xfs_buf), 0, - SLAB_HWCACHE_ALIGN, NULL); + xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, + SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD, + NULL); if (!xfs_buf_zone) goto out; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d79a1fe5d738..9a04c53c2488 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -272,6 +272,8 @@ static inline int xfs_buf_submit(struct xfs_buf *bp) } void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); +void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); +#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ extern void *xfs_buf_offset(struct xfs_buf *, size_t); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 663810e6cd59..1545657c3ca0 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -345,7 +345,7 @@ xfs_buf_item_format( * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 0d3b640cf1cc..871ec22c9aee 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -147,7 +147,7 @@ xfs_dir2_block_getdents( xfs_off_t cook; struct xfs_da_geometry *geo = args->geo; int lock_mode; - unsigned int offset; + unsigned int offset, next_offset; unsigned int end; /* @@ -173,9 +173,10 @@ xfs_dir2_block_getdents( * Loop over the data portion of the block. * Each object is a real entry (dep) or an unused one (dup). */ - offset = geo->data_entry_offset; end = xfs_dir3_data_end_offset(geo, bp->b_addr); - while (offset < end) { + for (offset = geo->data_entry_offset; + offset < end; + offset = next_offset) { struct xfs_dir2_data_unused *dup = bp->b_addr + offset; struct xfs_dir2_data_entry *dep = bp->b_addr + offset; uint8_t filetype; @@ -184,14 +185,15 @@ xfs_dir2_block_getdents( * Unused, skip it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - offset += be16_to_cpu(dup->length); + next_offset = offset + be16_to_cpu(dup->length); continue; } /* * Bump pointer for the next iteration. */ - offset += xfs_dir2_data_entsize(dp->i_mount, dep->namelen); + next_offset = offset + + xfs_dir2_data_entsize(dp->i_mount, dep->namelen); /* * The entry is before the desired starting point, skip it. diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 0b8350e84d28..f979d0d7e6cd 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -31,6 +31,7 @@ xfs_trim_extents( struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; + struct xfs_agf *agf; struct xfs_perag *pag; int error; int i; @@ -47,14 +48,14 @@ xfs_trim_extents( error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error) goto out_put_perag; + agf = agbp->b_addr; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); /* * Look up the longest btree in the AGF and start with it. */ - error = xfs_alloc_lookup_ge(cur, 0, - be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); + error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i); if (error) goto out_del_cursor; @@ -75,7 +76,7 @@ xfs_trim_extents( error = -EFSCORRUPTED; goto out_del_cursor; } - ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); + ASSERT(flen <= be32_to_cpu(agf->agf_longest)); /* * use daddr format for all range/len calculations as that is diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index d223e1ae90a6..af2c8e5ceea0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -829,9 +829,9 @@ xfs_qm_id_for_quotatype( { switch (type) { case XFS_DQ_USER: - return ip->i_d.di_uid; + return i_uid_read(VFS_I(ip)); case XFS_DQ_GROUP: - return ip->i_d.di_gid; + return i_gid_read(VFS_I(ip)); case XFS_DQ_PROJ: return ip->i_d.di_projid; } @@ -1105,8 +1105,8 @@ xfs_qm_dqflush( * Get the buffer containing the on-disk dquot */ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp, - &xfs_dquot_buf_ops); + mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK, + &bp, &xfs_dquot_buf_ops); if (error) goto out_unlock; @@ -1177,7 +1177,7 @@ xfs_qm_dqflush( out_unlock: xfs_dqfunlock(dqp); - return -EIO; + return error; } /* diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index d60647d7197b..baad1748d0d1 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: @@ -307,36 +308,62 @@ xfs_qm_qoffend_logitem_committed( { struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; - struct xfs_ail *ailp = qfs->qql_item.li_ailp; - /* - * Delete the qoff-start logitem from the AIL. - * xfs_trans_ail_delete() drops the AIL lock. - */ - spin_lock(&ailp->ail_lock); - xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + xfs_qm_qoff_logitem_relse(qfs); - kmem_free(qfs->qql_item.li_lv_shadow); kmem_free(lip->li_lv_shadow); - kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; } +STATIC void +xfs_qm_qoff_logitem_release( + struct xfs_log_item *lip) +{ + struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip); + + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { + if (qoff->qql_start_lip) + xfs_qm_qoff_logitem_relse(qoff->qql_start_lip); + xfs_qm_qoff_logitem_relse(qoff); + } +} + static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_committed = xfs_qm_qoffend_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, + .iop_release = xfs_qm_qoff_logitem_release, }; static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_push = xfs_qm_qoff_logitem_push, + .iop_release = xfs_qm_qoff_logitem_release, }; /* + * Delete the quotaoff intent from the AIL and free it. On success, + * this should only be called for the start item. It can be used for + * either on shutdown or abort. + */ +void +xfs_qm_qoff_logitem_relse( + struct xfs_qoff_logitem *qoff) +{ + struct xfs_log_item *lip = &qoff->qql_item; + + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) || + test_bit(XFS_LI_ABORTED, &lip->li_flags) || + XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + kmem_free(lip->li_lv_shadow); + kmem_free(qoff); +} + +/* * Allocate and initialize an quotaoff item of the correct quota type(s). */ struct xfs_qoff_logitem * diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 3bb19e556ade..2b86a43d7ce2 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -28,6 +28,7 @@ void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp, struct xfs_qoff_logitem *start, uint flags); +void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *); struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp, struct xfs_qoff_logitem *startqoff, uint flags); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 331765afc53e..a21e9cc6516a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -345,16 +345,19 @@ xfs_corruption_error( * Complain about the kinds of metadata corruption that we can't detect from a * verifier, such as incorrect inter-block relationship data. Does not set * bp->b_error. + * + * Call xfs_buf_mark_corrupt, not this function. */ void xfs_buf_corruption_error( - struct xfs_buf *bp) + struct xfs_buf *bp, + xfs_failaddr_t fa) { struct xfs_mount *mp = bp->b_mount; xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, "Metadata corruption detected at %pS, %s block 0x%llx", - __return_address, bp->b_ops->name, bp->b_bn); + fa, bp->b_ops->name, bp->b_bn); xfs_alert(mp, "Unmount and run xfs_repair"); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 31a5d321ba9a..1717b7508356 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -15,7 +15,7 @@ extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, const void *buf, size_t bufsize, const char *filename, int linenum, xfs_failaddr_t failaddr); -void xfs_buf_corruption_error(struct xfs_buf *bp); +void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa); extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name, const void *buf, size_t bufsz, xfs_failaddr_t failaddr); diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index f1372f9046e3..5a4b0119143a 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -15,7 +15,6 @@ #include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_icache.h" -#include "xfs_log.h" #include "xfs_pnfs.h" /* @@ -221,18 +220,7 @@ STATIC int xfs_fs_nfs_commit_metadata( struct inode *inode) { - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(XFS_I(inode)); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b8a4a3f29b36..4b8bdecc3863 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -80,19 +80,9 @@ xfs_dir_fsync( int datasync) { struct xfs_inode *ip = XFS_I(file->f_mapping->host); - struct xfs_mount *mp = ip->i_mount; - xfs_lsn_t lsn = 0; trace_xfs_dir_fsync(ip); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - lsn = ip->i_itemp->ili_last_lsn; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!lsn) - return 0; - return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_inode(ip); } STATIC int @@ -1069,7 +1059,11 @@ xfs_file_remap_range( ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, remap_flags); + if (ret) + goto out_unlock; + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_log_force_inode(dest); out_unlock: xfs_reflink_remap_unlock(file_in, file_out); if (ret) diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 918456ca29e1..4eebcec4aae6 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -344,7 +344,7 @@ xfs_getfsmap_datadev_helper( xfs_fsblock_t fsb; xfs_daddr_t rec_daddr; - fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock); + fsb = XFS_AGB_TO_FSB(mp, cur->bc_ag.agno, rec->rm_startblock); rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); @@ -362,7 +362,7 @@ xfs_getfsmap_datadev_bnobt_helper( struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; - rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno, + rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_ag.agno, rec->ar_startblock); irec.rm_startblock = rec->ar_startblock; @@ -896,6 +896,14 @@ xfs_getfsmap( info.format_arg = arg; info.head = head; + /* + * If fsmap runs concurrently with a scrub, the freeze can be delayed + * indefinitely as we walk the rmapbt and iterate over metadata + * buffers. Freeze quiesces the log (which waits for the buffer LRU to + * be emptied) and that won't happen while we're reading buffers. + */ + sb_start_write(mp->m_super); + /* For each device we support... */ for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { /* Is this device within the range the user asked for? */ @@ -935,6 +943,7 @@ xfs_getfsmap( if (tp) xfs_trans_cancel(tp); + sb_end_write(mp->m_super); head->fmh_oflags = FMH_OF_DEV_T; return error; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 8dc2e5414276..a7be7a9e5c1a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -289,6 +289,8 @@ xfs_reinit_inode( uint64_t version = inode_peek_iversion(inode); umode_t mode = inode->i_mode; dev_t dev = inode->i_rdev; + kuid_t uid = inode->i_uid; + kgid_t gid = inode->i_gid; error = inode_init_always(mp->m_super, inode); @@ -297,6 +299,8 @@ xfs_reinit_inode( inode_set_iversion_queried(inode, version); inode->i_mode = mode; inode->i_rdev = dev; + inode->i_uid = uid; + inode->i_gid = gid; return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c5077e6326c7..d1772786af29 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -801,26 +801,18 @@ xfs_ialloc( return error; ASSERT(ip != NULL); inode = VFS_I(ip); - - /* - * We always convert v1 inodes to v2 now - we only support filesystems - * with >= v2 inode capability, so there is no reason for ever leaving - * an inode in v1 format. - */ - if (ip->i_d.di_version == 1) - ip->i_d.di_version = 2; - inode->i_mode = mode; set_nlink(inode, nlink); - ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); - ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); + inode->i_uid = current_fsuid(); inode->i_rdev = rdev; ip->i_d.di_projid = prid; if (pip && XFS_INHERIT_GID(pip)) { - ip->i_d.di_gid = pip->i_d.di_gid; + inode->i_gid = VFS_I(pip)->i_gid; if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode)) inode->i_mode |= S_ISGID; + } else { + inode->i_gid = current_fsgid(); } /* @@ -828,9 +820,8 @@ xfs_ialloc( * ID or one of the supplementary group IDs, the S_ISGID bit is cleared * (and only if the irix_sgid_inherit compatibility variable is set). */ - if ((irix_sgid_inherit) && - (inode->i_mode & S_ISGID) && - (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) + if (irix_sgid_inherit && + (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid)) inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; @@ -847,14 +838,13 @@ xfs_ialloc( ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { inode_set_iversion(inode, 1); ip->i_d.di_flags2 = 0; ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime = tv; } - flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: @@ -907,20 +897,13 @@ xfs_ialloc( ip->i_d.di_flags |= di_flags; } - if (pip && - (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && - pip->i_d.di_version == 3 && - ip->i_d.di_version == 3) { - uint64_t di_flags2 = 0; - + if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) { if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; } if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; - - ip->i_d.di_flags2 |= di_flags2; + ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; } /* FALLTHROUGH */ case S_IFLNK: @@ -1122,7 +1105,6 @@ xfs_bumplink( { xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - ASSERT(ip->i_d.di_version > 1); inc_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -1158,8 +1140,7 @@ xfs_create( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -1219,8 +1200,7 @@ xfs_create( unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, - resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + resblks - XFS_IALLOC_SPACE_RES(mp)); if (error) { ASSERT(error != -ENOSPC); goto out_trans_cancel; @@ -1309,8 +1289,7 @@ xfs_create_tmpfile( /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -2119,7 +2098,7 @@ xfs_iunlink_update_bucket( unsigned int bucket_index, xfs_agino_t new_agino) { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + struct xfs_agi *agi = agibp->b_addr; xfs_agino_t old_value; int offset; @@ -2135,7 +2114,7 @@ xfs_iunlink_update_bucket( * head of the list. */ if (old_value == new_agino) { - xfs_buf_corruption_error(agibp); + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; } @@ -2259,7 +2238,7 @@ xfs_iunlink( error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* * Get the index into the agi hash table for the list this inode will @@ -2269,7 +2248,7 @@ xfs_iunlink( next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (next_agino == agino || !xfs_verify_agino_or_null(mp, agno, next_agino)) { - xfs_buf_corruption_error(agibp); + xfs_buf_mark_corrupt(agibp); return -EFSCORRUPTED; } @@ -2443,7 +2422,7 @@ xfs_iunlink_remove( error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; /* * Get the index into the agi hash table for the list this inode will @@ -2524,6 +2503,88 @@ out: } /* + * Look up the inode number specified and mark it stale if it is found. If it is + * dirty, return the inode so it can be attached to the cluster buffer so it can + * be processed appropriately when the cluster free transaction completes. + */ +static struct xfs_inode * +xfs_ifree_get_one_inode( + struct xfs_perag *pag, + struct xfs_inode *free_ip, + xfs_ino_t inum) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *ip; + +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); + + /* Inode not in memory, nothing to do */ + if (!ip) + goto out_rcu_unlock; + + /* + * because this is an RCU protected lookup, we could find a recently + * freed or even reallocated inode during the lookup. We need to check + * under the i_flags_lock for a valid inode here. Skip it if it is not + * valid, the wrong inode or stale. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + goto out_rcu_unlock; + } + spin_unlock(&ip->i_flags_lock); + + /* + * Don't try to lock/unlock the current inode, but we _cannot_ skip the + * other inodes that we did not find in the list attached to the buffer + * and are not already marked stale. If we can't lock it, back off and + * retry. + */ + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + + /* + * Check the inode number again in case we're racing with + * freeing in xfs_reclaim_inode(). See the comments in that + * function for more information as to why the initial check is + * not sufficient. + */ + if (ip->i_ino != inum) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_rcu_unlock; + } + } + rcu_read_unlock(); + + xfs_iflock(ip); + xfs_iflags_set(ip, XFS_ISTALE); + + /* + * We don't need to attach clean inodes or those only with unlogged + * changes (which we throw away, anyway). + */ + if (!ip->i_itemp || xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out_no_inode; + } + return ip; + +out_rcu_unlock: + rcu_read_unlock(); +out_no_inode: + return NULL; +} + +/* * A big issue when freeing the inode cluster is that we _cannot_ skip any * inodes that are in memory - they all must be marked stale and attached to * the cluster buffer. @@ -2623,77 +2684,11 @@ xfs_ifree_cluster( * even trying to lock them. */ for (i = 0; i < igeo->inodes_per_cluster; i++) { -retry: - rcu_read_lock(); - ip = radix_tree_lookup(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, (inum + i))); - - /* Inode not in memory, nothing to do */ - if (!ip) { - rcu_read_unlock(); + ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i); + if (!ip) continue; - } - - /* - * because this is an RCU protected lookup, we could - * find a recently freed or even reallocated inode - * during the lookup. We need to check under the - * i_flags_lock for a valid inode here. Skip it if it - * is not valid, the wrong inode or stale. - */ - spin_lock(&ip->i_flags_lock); - if (ip->i_ino != inum + i || - __xfs_iflags_test(ip, XFS_ISTALE)) { - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - continue; - } - spin_unlock(&ip->i_flags_lock); - - /* - * Don't try to lock/unlock the current inode, but we - * _cannot_ skip the other inodes that we did not find - * in the list attached to the buffer and are not - * already marked stale. If we can't lock it, back off - * and retry. - */ - if (ip != free_ip) { - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; - } - - /* - * Check the inode number again in case we're - * racing with freeing in xfs_reclaim_inode(). - * See the comments in that function for more - * information as to why the initial check is - * not sufficient. - */ - if (ip->i_ino != inum + i) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - rcu_read_unlock(); - continue; - } - } - rcu_read_unlock(); - - xfs_iflock(ip); - xfs_iflags_set(ip, XFS_ISTALE); - /* - * we don't need to attach clean inodes or those only - * with unlogged changes (which we throw away, anyway). - */ iip = ip->i_itemp; - if (!iip || xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; iip->ili_fsync_fields = 0; @@ -3807,7 +3802,6 @@ xfs_iflush_int( ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); ASSERT(iip != NULL && iip->ili_fields != 0); - ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); @@ -3868,7 +3862,7 @@ xfs_iflush_int( * backwards compatibility with old kernels that predate logging all * inode changes. */ - if (ip->i_d.di_version < 3) + if (!xfs_sb_version_has_v3inode(&mp->m_sb)) ip->i_d.di_flushiter++; /* Check the inline fork data before we write out. */ @@ -3951,3 +3945,22 @@ xfs_irele( trace_xfs_irele(ip, _RET_IP_); iput(VFS_I(ip)); } + +/* + * Ensure all commited transactions touching the inode are written to the log. + */ +int +xfs_log_force_inode( + struct xfs_inode *ip) +{ + xfs_lsn_t lsn = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL); +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 492e53992fa9..c6a63f6764a6 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -426,6 +426,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **, struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); +int xfs_log_force_inode(struct xfs_inode *ip); void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 8bd5d0de6321..f779cca2346f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -125,7 +125,7 @@ xfs_inode_item_size( *nvecs += 2; *nbytes += sizeof(struct xfs_inode_log_format) + - xfs_log_dinode_size(ip->i_d.di_version); + xfs_log_dinode_size(ip->i_mount); xfs_inode_item_data_fork_size(iip, nvecs, nbytes); if (XFS_IFORK_Q(ip)) @@ -305,11 +305,9 @@ xfs_inode_to_log_dinode( struct inode *inode = VFS_I(ip); to->di_magic = XFS_DINODE_MAGIC; - - to->di_version = from->di_version; to->di_format = from->di_format; - to->di_uid = from->di_uid; - to->di_gid = from->di_gid; + to->di_uid = i_uid_read(inode); + to->di_gid = i_gid_read(inode); to->di_projid_lo = from->di_projid & 0xffff; to->di_projid_hi = from->di_projid >> 16; @@ -339,7 +337,8 @@ xfs_inode_to_log_dinode( /* log a dummy value to ensure log structure is fully initialised */ to->di_next_unlinked = NULLAGINO; - if (from->di_version == 3) { + if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); to->di_crtime.t_sec = from->di_crtime.tv_sec; to->di_crtime.t_nsec = from->di_crtime.tv_nsec; @@ -351,6 +350,7 @@ xfs_inode_to_log_dinode( uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_flushiter = 0; } else { + to->di_version = 2; to->di_flushiter = from->di_flushiter; } } @@ -370,7 +370,7 @@ xfs_inode_item_format_core( dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE); xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn); - xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version)); + xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount)); } /* @@ -395,8 +395,6 @@ xfs_inode_item_format( struct xfs_log_iovec *vecp = NULL; struct xfs_inode_log_format *ilf; - ASSERT(ip->i_d.di_version > 1); - ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); ilf->ilf_type = XFS_LI_INODE; ilf->ilf_ino = ip->i_ino; @@ -554,7 +552,8 @@ xfs_inode_item_push( if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); - } + } else if (error == -EAGAIN) + rval = XFS_ITEM_LOCKED; spin_lock(&lip->li_ailp->ail_lock); out_unlock: @@ -732,29 +731,27 @@ xfs_iflush_done( * holding the lock before removing the inode from the AIL. */ if (need_ail) { - bool mlip_changed = false; + xfs_lsn_t tail_lsn = 0; /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->ail_lock); list_for_each_entry(blip, &tmp, li_bio_list) { if (INODE_ITEM(blip)->ili_logged && - blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) - mlip_changed |= xfs_ail_delete_one(ailp, blip); - else { + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) { + /* + * xfs_ail_update_finish() only cares about the + * lsn of the first tail item removed, any + * others will be at the same or higher lsn so + * we just ignore them. + */ + xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip); + if (!tail_lsn && lsn) + tail_lsn = lsn; + } else { xfs_clear_li_failed(blip); } } - - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - spin_unlock(&ailp->ail_lock); - - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + xfs_ail_update_finish(ailp, tail_lsn); } /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d42de92cb283..cdfb3cd9a25b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -35,6 +35,8 @@ #include "xfs_health.h" #include "xfs_reflink.h" #include "xfs_ioctl.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include <linux/mount.h> #include <linux/namei.h> @@ -292,62 +294,173 @@ xfs_readlink_by_handle( return error; } -STATIC int -xfs_attrlist_by_handle( - struct file *parfilp, - void __user *arg) +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +static void +xfs_ioc_attr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen) { - int error = -ENOMEM; - attrlist_cursor_kern_t *cursor; - struct xfs_fsop_attrlist_handlereq __user *p = arg; - xfs_fsop_attrlist_handlereq_t al_hreq; - struct dentry *dentry; - char *kbuf; + struct xfs_attrlist *alist = context->buffer; + struct xfs_attrlist_ent *aep; + int arraytop; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) - return -EFAULT; - if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XFS_XATTR_LIST_MAX) + ASSERT(!context->seen_enough); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. + */ + if (context->attr_filter != (flags & XFS_ATTR_NSP_ONDISK_MASK)) + return; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + + /* decrement by the actual bytes used by the attr */ + context->firstu -= round_up(offsetof(struct xfs_attrlist_ent, a_name) + + namelen + 1, sizeof(uint32_t)); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return; + } + + aep = context->buffer + context->firstu; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); +} + +static unsigned int +xfs_attr_filter( + u32 ioc_flags) +{ + if (ioc_flags & XFS_IOC_ATTR_ROOT) + return XFS_ATTR_ROOT; + if (ioc_flags & XFS_IOC_ATTR_SECURE) + return XFS_ATTR_SECURE; + return 0; +} + +static unsigned int +xfs_attr_flags( + u32 ioc_flags) +{ + if (ioc_flags & XFS_IOC_ATTR_CREATE) + return XATTR_CREATE; + if (ioc_flags & XFS_IOC_ATTR_REPLACE) + return XATTR_REPLACE; + return 0; +} + +int +xfs_ioc_attr_list( + struct xfs_inode *dp, + void __user *ubuf, + int bufsize, + int flags, + struct xfs_attrlist_cursor __user *ucursor) +{ + struct xfs_attr_list_context context = { }; + struct xfs_attrlist *alist; + void *buffer; + int error; + + if (bufsize < sizeof(struct xfs_attrlist) || + bufsize > XFS_XATTR_LIST_MAX) return -EINVAL; /* * Reject flags, only allow namespaces. */ - if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + if (flags & ~(XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) + return -EINVAL; + if (flags == (XFS_IOC_ATTR_ROOT | XFS_IOC_ATTR_SECURE)) return -EINVAL; - dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); + /* + * Validate the cursor. + */ + if (copy_from_user(&context.cursor, ucursor, sizeof(context.cursor))) + return -EFAULT; + if (context.cursor.pad1 || context.cursor.pad2) + return -EINVAL; + if (!context.cursor.initted && + (context.cursor.hashval || context.cursor.blkno || + context.cursor.offset)) + return -EINVAL; - kbuf = kmem_zalloc_large(al_hreq.buflen, 0); - if (!kbuf) - goto out_dput; + buffer = kmem_zalloc_large(bufsize, 0); + if (!buffer) + return -ENOMEM; - cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, - al_hreq.flags, cursor); + /* + * Initialize the output buffer. + */ + context.dp = dp; + context.resynch = 1; + context.attr_filter = xfs_attr_filter(flags); + context.buffer = buffer; + context.bufsize = round_down(bufsize, sizeof(uint32_t)); + context.firstu = context.bufsize; + context.put_listent = xfs_ioc_attr_put_listent; + + alist = context.buffer; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list(&context); if (error) - goto out_kfree; + goto out_free; - if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { + if (copy_to_user(ubuf, buffer, bufsize) || + copy_to_user(ucursor, &context.cursor, sizeof(context.cursor))) error = -EFAULT; - goto out_kfree; - } +out_free: + kmem_free(buffer); + return error; +} - if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) - error = -EFAULT; +STATIC int +xfs_attrlist_by_handle( + struct file *parfilp, + struct xfs_fsop_attrlist_handlereq __user *p) +{ + struct xfs_fsop_attrlist_handlereq al_hreq; + struct dentry *dentry; + int error = -ENOMEM; -out_kfree: - kmem_free(kbuf); -out_dput: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) + return -EFAULT; + + dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), al_hreq.buffer, + al_hreq.buflen, al_hreq.flags, &p->pos); dput(dentry); return error; } -int +static int xfs_attrmulti_attr_get( struct inode *inode, unsigned char *name, @@ -355,31 +468,33 @@ xfs_attrmulti_attr_get( uint32_t *len, uint32_t flags) { - unsigned char *kbuf; - int error = -EFAULT; - size_t namelen; + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .attr_flags = xfs_attr_flags(flags), + .name = name, + .namelen = strlen(name), + .valuelen = *len, + }; + int error; if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; - kbuf = kmem_zalloc_large(*len, 0); - if (!kbuf) - return -ENOMEM; - namelen = strlen(name); - error = xfs_attr_get(XFS_I(inode), name, namelen, &kbuf, (int *)len, - flags); + error = xfs_attr_get(&args); if (error) goto out_kfree; - if (copy_to_user(ubuf, kbuf, *len)) + *len = args.valuelen; + if (copy_to_user(ubuf, args.value, args.valuelen)) error = -EFAULT; out_kfree: - kmem_free(kbuf); + kmem_free(args.value); return error; } -int +static int xfs_attrmulti_attr_set( struct inode *inode, unsigned char *name, @@ -387,42 +502,75 @@ xfs_attrmulti_attr_set( uint32_t len, uint32_t flags) { - unsigned char *kbuf; + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = xfs_attr_filter(flags), + .attr_flags = xfs_attr_flags(flags), + .name = name, + .namelen = strlen(name), + }; int error; - size_t namelen; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - if (len > XFS_XATTR_SIZE_MAX) - return -EINVAL; - kbuf = memdup_user(ubuf, len); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + if (ubuf) { + if (len > XFS_XATTR_SIZE_MAX) + return -EINVAL; + args.value = memdup_user(ubuf, len); + if (IS_ERR(args.value)) + return PTR_ERR(args.value); + args.valuelen = len; + } - namelen = strlen(name); - error = xfs_attr_set(XFS_I(inode), name, namelen, kbuf, len, flags); - if (!error) - xfs_forget_acl(inode, name, flags); - kfree(kbuf); + error = xfs_attr_set(&args); + if (!error && (flags & XFS_IOC_ATTR_ROOT)) + xfs_forget_acl(inode, name); + kfree(args.value); return error; } int -xfs_attrmulti_attr_remove( +xfs_ioc_attrmulti_one( + struct file *parfilp, struct inode *inode, - unsigned char *name, + uint32_t opcode, + void __user *uname, + void __user *value, + uint32_t *len, uint32_t flags) { + unsigned char *name; int error; - size_t namelen; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - namelen = strlen(name); - error = xfs_attr_remove(XFS_I(inode), name, namelen, flags); - if (!error) - xfs_forget_acl(inode, name, flags); + if ((flags & XFS_IOC_ATTR_ROOT) && (flags & XFS_IOC_ATTR_SECURE)) + return -EINVAL; + + name = strndup_user(uname, MAXNAMELEN); + if (IS_ERR(name)) + return PTR_ERR(name); + + switch (opcode) { + case ATTR_OP_GET: + error = xfs_attrmulti_attr_get(inode, name, value, len, flags); + break; + case ATTR_OP_REMOVE: + value = NULL; + *len = 0; + /* fall through */ + case ATTR_OP_SET: + error = mnt_want_write_file(parfilp); + if (error) + break; + error = xfs_attrmulti_attr_set(inode, name, value, *len, flags); + mnt_drop_write_file(parfilp); + break; + default: + error = -EINVAL; + break; + } + + kfree(name); return error; } @@ -436,7 +584,6 @@ xfs_attrmulti_by_handle( xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -462,63 +609,17 @@ xfs_attrmulti_by_handle( goto out_dput; } - error = -ENOMEM; - attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); - if (!attr_name) - goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { - if ((ops[i].am_flags & ATTR_ROOT) && - (ops[i].am_flags & ATTR_SECURE)) { - ops[i].am_error = -EINVAL; - continue; - } - ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; - - ops[i].am_error = strncpy_from_user((char *)attr_name, - ops[i].am_attrname, MAXNAMELEN); - if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; - if (ops[i].am_error < 0) - break; - - switch (ops[i].am_opcode) { - case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get( - d_inode(dentry), attr_name, - ops[i].am_attrvalue, &ops[i].am_length, - ops[i].am_flags); - break; - case ATTR_OP_SET: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_set( - d_inode(dentry), attr_name, - ops[i].am_attrvalue, ops[i].am_length, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_remove( - d_inode(dentry), attr_name, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - default: - ops[i].am_error = -EINVAL; - } + ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, + d_inode(dentry), ops[i].am_opcode, + ops[i].am_attrname, ops[i].am_attrvalue, + &ops[i].am_length, ops[i].am_flags); } if (copy_to_user(am_hreq.ops, ops, size)) error = -EFAULT; - kfree(attr_name); - out_kfree_ops: kfree(ops); out_dput: dput(dentry); @@ -1162,7 +1263,7 @@ xfs_ioctl_setattr_xflags( /* diflags2 only valid for v3 inodes. */ di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); - if (di_flags2 && ip->i_d.di_version < 3) + if (di_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb)) return -EINVAL; ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); @@ -1372,8 +1473,7 @@ xfs_ioctl_setattr_check_cowextsize( if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) return 0; - if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) || - ip->i_d.di_version != 3) + if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb)) return -EINVAL; if (fa->fsx_cowextsize == 0) @@ -1434,9 +1534,9 @@ xfs_ioctl_setattr( * because the i_*dquot fields will get updated anyway. */ if (XFS_IS_QUOTA_ON(mp)) { - code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, - ip->i_d.di_gid, fa->fsx_projid, - XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); + code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid, + VFS_I(ip)->i_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); if (code) return code; } @@ -1501,7 +1601,6 @@ xfs_ioctl_setattr( olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } - ASSERT(ip->i_d.di_version > 1); ip->i_d.di_projid = fa->fsx_projid; } @@ -1514,7 +1613,7 @@ xfs_ioctl_setattr( ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; else ip->i_d.di_extsize = 0; - if (ip->i_d.di_version == 3 && + if (xfs_sb_version_has_v3inode(&mp->m_sb) && (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) ip->i_d.di_cowextsize = fa->fsx_cowextsize >> mp->m_sb.sb_blocklog; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 420bd95dc326..bab6a5a92407 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -6,6 +6,11 @@ #ifndef __XFS_IOCTL_H__ #define __XFS_IOCTL_H__ +struct xfs_bstat; +struct xfs_ibulk; +struct xfs_inogrp; + + extern int xfs_ioc_space( struct file *filp, @@ -30,27 +35,11 @@ xfs_readlink_by_handle( struct file *parfilp, xfs_fsop_handlereq_t *hreq); -extern int -xfs_attrmulti_attr_get( - struct inode *inode, - unsigned char *name, - unsigned char __user *ubuf, - uint32_t *len, - uint32_t flags); - -extern int -xfs_attrmulti_attr_set( - struct inode *inode, - unsigned char *name, - const unsigned char __user *ubuf, - uint32_t len, - uint32_t flags); - -extern int -xfs_attrmulti_attr_remove( - struct inode *inode, - unsigned char *name, - uint32_t flags); +int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode, + uint32_t opcode, void __user *uname, void __user *value, + uint32_t *len, uint32_t flags); +int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize, + int flags, struct xfs_attrlist_cursor __user *ucursor); extern struct dentry * xfs_handle_to_dentry( @@ -70,10 +59,6 @@ xfs_file_compat_ioctl( unsigned int cmd, unsigned long arg); -struct xfs_ibulk; -struct xfs_bstat; -struct xfs_inogrp; - int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq, const struct xfs_bulkstat *bstat); int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 769581a79c58..c1771e728117 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -352,56 +352,24 @@ xfs_compat_handlereq_to_dentry( STATIC int xfs_compat_attrlist_by_handle( struct file *parfilp, - void __user *arg) + compat_xfs_fsop_attrlist_handlereq_t __user *p) { - int error; - attrlist_cursor_kern_t *cursor; - compat_xfs_fsop_attrlist_handlereq_t __user *p = arg; compat_xfs_fsop_attrlist_handlereq_t al_hreq; struct dentry *dentry; - char *kbuf; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (copy_from_user(&al_hreq, arg, - sizeof(compat_xfs_fsop_attrlist_handlereq_t))) + if (copy_from_user(&al_hreq, p, sizeof(al_hreq))) return -EFAULT; - if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XFS_XATTR_LIST_MAX) - return -EINVAL; - - /* - * Reject flags, only allow namespaces. - */ - if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) - return -EINVAL; dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); - error = -ENOMEM; - kbuf = kmem_zalloc_large(al_hreq.buflen, 0); - if (!kbuf) - goto out_dput; - - cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, - al_hreq.flags, cursor); - if (error) - goto out_kfree; - - if (copy_to_user(&p->pos, cursor, sizeof(attrlist_cursor_kern_t))) { - error = -EFAULT; - goto out_kfree; - } - - if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) - error = -EFAULT; - -out_kfree: - kmem_free(kbuf); -out_dput: + error = xfs_ioc_attr_list(XFS_I(d_inode(dentry)), + compat_ptr(al_hreq.buffer), al_hreq.buflen, + al_hreq.flags, &p->pos); dput(dentry); return error; } @@ -416,7 +384,6 @@ xfs_compat_attrmulti_by_handle( compat_xfs_fsop_attrmulti_handlereq_t am_hreq; struct dentry *dentry; unsigned int i, size; - unsigned char *attr_name; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -443,64 +410,18 @@ xfs_compat_attrmulti_by_handle( goto out_dput; } - error = -ENOMEM; - attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); - if (!attr_name) - goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { - if ((ops[i].am_flags & ATTR_ROOT) && - (ops[i].am_flags & ATTR_SECURE)) { - ops[i].am_error = -EINVAL; - continue; - } - ops[i].am_flags &= ~ATTR_KERNEL_FLAGS; - - ops[i].am_error = strncpy_from_user((char *)attr_name, + ops[i].am_error = xfs_ioc_attrmulti_one(parfilp, + d_inode(dentry), ops[i].am_opcode, compat_ptr(ops[i].am_attrname), - MAXNAMELEN); - if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; - if (ops[i].am_error < 0) - break; - - switch (ops[i].am_opcode) { - case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get( - d_inode(dentry), attr_name, - compat_ptr(ops[i].am_attrvalue), - &ops[i].am_length, ops[i].am_flags); - break; - case ATTR_OP_SET: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_set( - d_inode(dentry), attr_name, - compat_ptr(ops[i].am_attrvalue), - ops[i].am_length, ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - case ATTR_OP_REMOVE: - ops[i].am_error = mnt_want_write_file(parfilp); - if (ops[i].am_error) - break; - ops[i].am_error = xfs_attrmulti_attr_remove( - d_inode(dentry), attr_name, - ops[i].am_flags); - mnt_drop_write_file(parfilp); - break; - default: - ops[i].am_error = -EINVAL; - } + compat_ptr(ops[i].am_attrvalue), + &ops[i].am_length, ops[i].am_flags); } if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) error = -EFAULT; - kfree(attr_name); - out_kfree_ops: kfree(ops); out_dput: dput(dentry); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 81f2f93caec0..f7a99b3bbcf7 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -22,7 +22,6 @@ #include "xfs_iomap.h" #include "xfs_error.h" -#include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/security.h> #include <linux/iversion.h> @@ -50,10 +49,15 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, - strlen(xattr->name), - xattr->value, xattr->value_len, - ATTR_SECURE); + struct xfs_da_args args = { + .dp = ip, + .attr_filter = XFS_ATTR_SECURE, + .name = xattr->name, + .namelen = strlen(xattr->name), + .value = xattr->value, + .valuelen = xattr->value_len, + }; + error = xfs_attr_set(&args); if (error < 0) break; } @@ -553,7 +557,7 @@ xfs_vn_getattr( stat->blocks = XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); - if (ip->i_d.di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = ip->i_d.di_crtime; @@ -692,9 +696,7 @@ xfs_setattr_nonsize( */ ASSERT(udqp == NULL); ASSERT(gdqp == NULL); - error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), - xfs_kgid_to_gid(gid), - ip->i_d.di_projid, + error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid, qflags, &udqp, &gdqp, NULL); if (error) return error; @@ -763,7 +765,6 @@ xfs_setattr_nonsize( olddquot1 = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp); } - ip->i_d.di_uid = xfs_kuid_to_uid(uid); inode->i_uid = uid; } if (!gid_eq(igid, gid)) { @@ -775,7 +776,6 @@ xfs_setattr_nonsize( olddquot2 = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - ip->i_d.di_gid = xfs_kgid_to_gid(gid); inode->i_gid = gid; } } @@ -1304,9 +1304,6 @@ xfs_setup_inode( /* make the inode look hashed for the writeback code */ inode_fake_hash(inode); - inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); - inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); - i_size_write(inode, ip->i_d.di_size); xfs_diflags_to_iflags(inode, ip); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 4b31c29b7e6b..ff2da28fed90 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -86,8 +86,8 @@ xfs_bulkstat_one_int( */ buf->bs_projectid = ip->i_d.di_projid; buf->bs_ino = ino; - buf->bs_uid = dic->di_uid; - buf->bs_gid = dic->di_gid; + buf->bs_uid = i_uid_read(inode); + buf->bs_gid = i_gid_read(inode); buf->bs_size = dic->di_size; buf->bs_nlink = inode->i_nlink; @@ -110,7 +110,7 @@ xfs_bulkstat_one_int( buf->bs_forkoff = XFS_IFORK_BOFF(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; - if (dic->di_version == 3) { + if (xfs_sb_version_has_v3inode(&mp->m_sb)) { if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) buf->bs_cowextsize_blks = dic->di_cowextsize; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 8738bb03f253..9f70d2f68e05 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -60,6 +60,7 @@ typedef __u32 xfs_nlink_t; #include <linux/list_sort.h> #include <linux/ratelimit.h> #include <linux/rhashtable.h> +#include <linux/xattr.h> #include <asm/page.h> #include <asm/div64.h> @@ -163,32 +164,6 @@ struct xstats { extern struct xstats xfsstats; -/* Kernel uid/gid conversion. These are used to convert to/from the on disk - * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. - * The conversion here is type only, the value will remain the same since we - * are converting to the init_user_ns. The uid is later mapped to a particular - * user namespace value when crossing the kernel/user boundary. - */ -static inline uint32_t xfs_kuid_to_uid(kuid_t uid) -{ - return from_kuid(&init_user_ns, uid); -} - -static inline kuid_t xfs_uid_to_kuid(uint32_t uid) -{ - return make_kuid(&init_user_ns, uid); -} - -static inline uint32_t xfs_kgid_to_gid(kgid_t gid) -{ - return from_kgid(&init_user_ns, gid); -} - -static inline kgid_t xfs_gid_to_kgid(uint32_t gid) -{ - return make_kgid(&init_user_ns, gid); -} - static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) { return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f6006d94a581..00fda2e8e738 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -24,13 +24,6 @@ kmem_zone_t *xfs_log_ticket_zone; /* Local miscellaneous function prototypes */ -STATIC int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp); - STATIC struct xlog * xlog_alloc_log( struct xfs_mount *mp, @@ -47,8 +40,7 @@ xlog_dealloc_log( /* local state machine functions */ STATIC void xlog_state_done_syncing( - struct xlog_in_core *iclog, - bool aborted); + struct xlog_in_core *iclog); STATIC int xlog_state_get_iclog_space( struct xlog *log, @@ -63,23 +55,10 @@ xlog_state_switch_iclogs( struct xlog_in_core *iclog, int eventual_size); STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog); - -STATIC void xlog_grant_push_ail( struct xlog *log, int need_bytes); STATIC void -xlog_regrant_reserve_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void -xlog_ungrant_log_space( - struct xlog *log, - struct xlog_ticket *ticket); -STATIC void xlog_sync( struct xlog *log, struct xlog_in_core *iclog); @@ -484,73 +463,6 @@ out_error: return error; } - -/* - * NOTES: - * - * 1. currblock field gets updated at startup and after in-core logs - * marked as with WANT_SYNC. - */ - -/* - * This routine is called when a user of a log manager ticket is done with - * the reservation. If the ticket was ever used, then a commit record for - * the associated transaction is written out as a log operation header with - * no data. The flag XLOG_TIC_INITED is set when the first write occurs with - * a given ticket. If the ticket was one with a permanent reservation, then - * a few operations are done differently. Permanent reservation tickets by - * default don't release the reservation. They just commit the current - * transaction with the belief that the reservation is still needed. A flag - * must be passed in before permanent reservations are actually released. - * When these type of tickets are not released, they need to be set into - * the inited state again. By doing this, a start record will be written - * out when the next write occurs. - */ -xfs_lsn_t -xfs_log_done( - struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant) -{ - struct xlog *log = mp->m_log; - xfs_lsn_t lsn = 0; - - if (XLOG_FORCED_SHUTDOWN(log) || - /* - * If nothing was ever written, don't write out commit record. - * If we get an error, just continue and give back the log ticket. - */ - (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(log, ticket, iclog, &lsn)))) { - lsn = (xfs_lsn_t) -1; - regrant = false; - } - - - if (!regrant) { - trace_xfs_log_done_nonperm(log, ticket); - - /* - * Release ticket if not permanent reservation or a specific - * request has been made to release a permanent reservation. - */ - xlog_ungrant_log_space(log, ticket); - } else { - trace_xfs_log_done_perm(log, ticket); - - xlog_regrant_reserve_log_space(log, ticket); - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. - */ - ticket->t_flags |= XLOG_TIC_INITED; - } - - xfs_log_ticket_put(ticket); - return lsn; -} - static bool __xlog_state_release_iclog( struct xlog *log, @@ -597,26 +509,21 @@ xlog_state_release_iclog( return 0; } -int +void xfs_log_release_iclog( - struct xfs_mount *mp, struct xlog_in_core *iclog) { - struct xlog *log = mp->m_log; - bool sync; - - if (iclog->ic_state == XLOG_STATE_IOERROR) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - return -EIO; - } + struct xlog *log = iclog->ic_log; + bool sync = false; if (atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) { - sync = __xlog_state_release_iclog(log, iclog); + if (iclog->ic_state != XLOG_STATE_IOERROR) + sync = __xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); - if (sync) - xlog_sync(log, iclog); } - return 0; + + if (sync) + xlog_sync(log, iclog); } /* @@ -855,32 +762,69 @@ xfs_log_mount_cancel( } /* - * Final log writes as part of unmount. - * - * Mark the filesystem clean as unmount happens. Note that during relocation - * this routine needs to be executed as part of source-bag while the - * deallocation must not be done until source-end. + * Wait for the iclog to be written disk, or return an error if the log has been + * shut down. */ +static int +xlog_wait_on_iclog( + struct xlog_in_core *iclog) + __releases(iclog->ic_log->l_icloglock) +{ + struct xlog *log = iclog->ic_log; -/* Actually write the unmount record to disk. */ -static void -xfs_log_write_unmount_record( - struct xfs_mount *mp) + if (!XLOG_FORCED_SHUTDOWN(log) && + iclog->ic_state != XLOG_STATE_ACTIVE && + iclog->ic_state != XLOG_STATE_DIRTY) { + XFS_STATS_INC(log->l_mp, xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + } else { + spin_unlock(&log->l_icloglock); + } + + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + return 0; +} + +/* + * Write out an unmount record using the ticket provided. We have to account for + * the data space used in the unmount ticket as this write is not done from a + * transaction context that has already done the accounting for us. + */ +static int +xlog_write_unmount_record( + struct xlog *log, + struct xlog_ticket *ticket, + xfs_lsn_t *lsn, + uint flags) { - /* the data section must be 32 bit size aligned */ - struct xfs_unmount_log_format magic = { + struct xfs_unmount_log_format ulf = { .magic = XLOG_UNMOUNT_TYPE, }; struct xfs_log_iovec reg = { - .i_addr = &magic, - .i_len = sizeof(magic), + .i_addr = &ulf, + .i_len = sizeof(ulf), .i_type = XLOG_REG_TYPE_UNMOUNT, }; struct xfs_log_vec vec = { .lv_niovecs = 1, .lv_iovecp = ®, }; - struct xlog *log = mp->m_log; + + /* account for space used by record data */ + ticket->t_curr_res -= sizeof(ulf); + return xlog_write(log, &vec, ticket, lsn, NULL, flags, false); +} + +/* + * Mark the filesystem clean by writing an unmount record to the head of the + * log. + */ +static void +xlog_unmount_write( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; struct xlog_in_core *iclog; struct xlog_ticket *tic = NULL; xfs_lsn_t lsn; @@ -891,23 +835,7 @@ xfs_log_write_unmount_record( if (error) goto out_err; - /* - * If we think the summary counters are bad, clear the unmount header - * flag in the unmount record so that the summary counters will be - * recalculated during log recovery at next mount. Refer to - * xlog_check_unmount_rec for more details. - */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { - xfs_alert(mp, "%s: will fix summary counters at next mount", - __func__); - flags &= ~XLOG_UNMOUNT_TRANS; - } - - /* remove inited flag, and account for space used */ - tic->t_flags = 0; - tic->t_curr_res -= sizeof(magic); - error = xlog_write(log, &vec, tic, &lsn, NULL, flags); + error = xlog_write_unmount_record(log, tic, &lsn, flags); /* * At this point, we're umounting anyway, so there's no point in * transitioning log state to IOERROR. Just continue... @@ -919,28 +847,32 @@ out_err: spin_lock(&log->l_icloglock); iclog = log->l_iclog; atomic_inc(&iclog->ic_refcnt); - xlog_state_want_sync(log, iclog); + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR); error = xlog_state_release_iclog(log, iclog); - switch (iclog->ic_state) { - default: - if (!XLOG_FORCED_SHUTDOWN(log)) { - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - break; - } - /* fall through */ - case XLOG_STATE_ACTIVE: - case XLOG_STATE_DIRTY: - spin_unlock(&log->l_icloglock); - break; - } + xlog_wait_on_iclog(iclog); if (tic) { trace_xfs_log_umount_write(log, tic); - xlog_ungrant_log_space(log, tic); - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); } } +static void +xfs_log_unmount_verify_iclog( + struct xlog *log) +{ + struct xlog_in_core *iclog = log->l_iclog; + + do { + ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + ASSERT(iclog->ic_offset == 0); + } while ((iclog = iclog->ic_next) != log->l_iclog); +} + /* * Unmount record used to have a string "Unmount filesystem--" in the * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). @@ -948,16 +880,11 @@ out_err: * currently architecture converted and "Unmount" is a bit foo. * As far as I know, there weren't any dependencies on the old behaviour. */ - -static int -xfs_log_unmount_write(xfs_mount_t *mp) +static void +xfs_log_unmount_write( + struct xfs_mount *mp) { - struct xlog *log = mp->m_log; - xlog_in_core_t *iclog; -#ifdef DEBUG - xlog_in_core_t *first_iclog; -#endif - int error; + struct xlog *log = mp->m_log; /* * Don't write out unmount record on norecovery mounts or ro devices. @@ -966,57 +893,30 @@ xfs_log_unmount_write(xfs_mount_t *mp) if (mp->m_flags & XFS_MOUNT_NORECOVERY || xfs_readonly_buftarg(log->l_targ)) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); - return 0; + return; } - error = xfs_log_force(mp, XFS_LOG_SYNC); - ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); + xfs_log_force(mp, XFS_LOG_SYNC); -#ifdef DEBUG - first_iclog = iclog = log->l_iclog; - do { - if (iclog->ic_state != XLOG_STATE_IOERROR) { - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); - ASSERT(iclog->ic_offset == 0); - } - iclog = iclog->ic_next; - } while (iclog != first_iclog); -#endif - if (! (XLOG_FORCED_SHUTDOWN(log))) { - xfs_log_write_unmount_record(mp); - } else { - /* - * We're already in forced_shutdown mode, couldn't - * even attempt to write out the unmount transaction. - * - * Go through the motions of sync'ing and releasing - * the iclog, even though no I/O will actually happen, - * we need to wait for other log I/Os that may already - * be in progress. Do this as a separate section of - * code so we'll know if we ever get stuck here that - * we're in this odd situation of trying to unmount - * a file system that went into forced_shutdown as - * the result of an unmount.. - */ - spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - atomic_inc(&iclog->ic_refcnt); - xlog_state_want_sync(log, iclog); - error = xlog_state_release_iclog(log, iclog); - switch (iclog->ic_state) { - case XLOG_STATE_ACTIVE: - case XLOG_STATE_DIRTY: - case XLOG_STATE_IOERROR: - spin_unlock(&log->l_icloglock); - break; - default: - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - break; - } + if (XLOG_FORCED_SHUTDOWN(log)) + return; + + /* + * If we think the summary counters are bad, avoid writing the unmount + * record to force log recovery at next mount, after which the summary + * counters will be recalculated. Refer to xlog_check_unmount_rec for + * more details. + */ + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, + XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + xfs_alert(mp, "%s: will fix summary counters at next mount", + __func__); + return; } - return error; -} /* xfs_log_unmount_write */ + xfs_log_unmount_verify_iclog(log); + xlog_unmount_write(log); +} /* * Empty the log for unmount/freeze. @@ -1279,7 +1179,6 @@ xlog_ioend_work( struct xlog_in_core *iclog = container_of(work, struct xlog_in_core, ic_end_io_work); struct xlog *log = iclog->ic_log; - bool aborted = false; int error; error = blk_status_to_errno(iclog->ic_bio.bi_status); @@ -1295,17 +1194,9 @@ xlog_ioend_work( if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); - /* - * This flag will be propagated to the trans-committed - * callback routines to let them know that the log-commit - * didn't succeed. - */ - aborted = true; - } else if (iclog->ic_state == XLOG_STATE_IOERROR) { - aborted = true; } - xlog_state_done_syncing(iclog, aborted); + xlog_state_done_syncing(iclog); bio_uninit(&iclog->ic_bio); /* @@ -1551,20 +1442,17 @@ out: return ERR_PTR(error); } /* xlog_alloc_log */ - /* * Write out the commit record of a transaction associated with the given - * ticket. Return the lsn of the commit record. + * ticket to close off a running log write. Return the lsn of the commit record. */ -STATIC int +int xlog_commit_record( struct xlog *log, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - xfs_lsn_t *commitlsnp) + xfs_lsn_t *lsn) { - struct xfs_mount *mp = log->l_mp; - int error; struct xfs_log_iovec reg = { .i_addr = NULL, .i_len = 0, @@ -1574,12 +1462,15 @@ xlog_commit_record( .lv_niovecs = 1, .lv_iovecp = ®, }; + int error; - ASSERT_ALWAYS(iclog); - error = xlog_write(log, &vec, ticket, commitlsnp, iclog, - XLOG_COMMIT_TRANS); + if (XLOG_FORCED_SHUTDOWN(log)) + return -EIO; + + error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS, + false); if (error) - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -1739,7 +1630,7 @@ xlog_bio_end_io( &iclog->ic_end_io_work); } -static void +static int xlog_map_iclog_data( struct bio *bio, void *data, @@ -1750,11 +1641,14 @@ xlog_map_iclog_data( unsigned int off = offset_in_page(data); size_t len = min_t(size_t, count, PAGE_SIZE - off); - WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len); + if (bio_add_page(bio, page, len, off) != len) + return -EIO; data += len; count -= len; } while (count); + + return 0; } STATIC void @@ -1784,7 +1678,7 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. */ - xlog_state_done_syncing(iclog, true); + xlog_state_done_syncing(iclog); up(&iclog->ic_sema); return; } @@ -1794,11 +1688,22 @@ xlog_write_iclog( iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno; iclog->ic_bio.bi_end_io = xlog_bio_end_io; iclog->ic_bio.bi_private = iclog; - iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA; + + /* + * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more + * IOs coming immediately after this one. This prevents the block layer + * writeback throttle from throttling log writes behind background + * metadata writeback and causing priority inversions. + */ + iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | + REQ_IDLE | REQ_FUA; if (need_flush) iclog->ic_bio.bi_opf |= REQ_PREFLUSH; - xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count); + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + return; + } if (is_vmalloc_addr(iclog->ic_data)) flush_kernel_vmap_range(iclog->ic_data, count); @@ -2011,7 +1916,7 @@ xlog_dealloc_log( log->l_mp->m_log = NULL; destroy_workqueue(log->l_ioend_workqueue); kmem_free(log); -} /* xlog_dealloc_log */ +} /* * Update counters atomically now that memcpy is done. @@ -2148,23 +2053,21 @@ xlog_print_trans( } /* - * Calculate the potential space needed by the log vector. Each region gets - * its own xlog_op_header_t and may need to be double word aligned. + * Calculate the potential space needed by the log vector. We may need a start + * record, and each region gets its own struct xlog_op_header and may need to be + * double word aligned. */ static int xlog_write_calc_vec_length( struct xlog_ticket *ticket, - struct xfs_log_vec *log_vector) + struct xfs_log_vec *log_vector, + bool need_start_rec) { struct xfs_log_vec *lv; - int headers = 0; + int headers = need_start_rec ? 1 : 0; int len = 0; int i; - /* acct for start rec of xact */ - if (ticket->t_flags & XLOG_TIC_INITED) - headers++; - for (lv = log_vector; lv; lv = lv->lv_next) { /* we don't write ordered log vectors */ if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) @@ -2186,27 +2089,16 @@ xlog_write_calc_vec_length( return len; } -/* - * If first write for transaction, insert start record We can't be trying to - * commit if we are inited. We can't have any "partial_copy" if we are inited. - */ -static int +static void xlog_write_start_rec( struct xlog_op_header *ophdr, struct xlog_ticket *ticket) { - if (!(ticket->t_flags & XLOG_TIC_INITED)) - return 0; - ophdr->oh_tid = cpu_to_be32(ticket->t_tid); ophdr->oh_clientid = ticket->t_clientid; ophdr->oh_len = 0; ophdr->oh_flags = XLOG_START_TRANS; ophdr->oh_res2 = 0; - - ticket->t_flags &= ~XLOG_TIC_INITED; - - return sizeof(struct xlog_op_header); } static xlog_op_header_t * @@ -2328,7 +2220,11 @@ xlog_write_copy_finish( *record_cnt = 0; *data_cnt = 0; - xlog_state_want_sync(log, iclog); + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + iclog->ic_state == XLOG_STATE_IOERROR); if (!commit_iclog) goto release_iclog; spin_unlock(&log->l_icloglock); @@ -2391,13 +2287,14 @@ xlog_write( struct xlog_ticket *ticket, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, - uint flags) + uint flags, + bool need_start_rec) { struct xlog_in_core *iclog = NULL; - struct xfs_log_iovec *vecp; - struct xfs_log_vec *lv; + struct xfs_log_vec *lv = log_vector; + struct xfs_log_iovec *vecp = lv->lv_iovecp; + int index = 0; int len; - int index; int partial_copy = 0; int partial_copy_len = 0; int contwr = 0; @@ -2405,25 +2302,13 @@ xlog_write( int data_cnt = 0; int error = 0; - *start_lsn = 0; - - len = xlog_write_calc_vec_length(ticket, log_vector); - /* - * Region headers and bytes are already accounted for. - * We only need to take into account start records and - * split regions in this function. + * If this is a commit or unmount transaction, we don't need a start + * record to be written. We do, however, have to account for the + * commit or unmount header that gets written. Hence we always have + * to account for an extra xlog_op_header here. */ - if (ticket->t_flags & XLOG_TIC_INITED) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - - /* - * Commit record headers need to be accounted for. These - * come in as separate writes so are easy to detect. - */ - if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) - ticket->t_curr_res -= sizeof(xlog_op_header_t); - + ticket->t_curr_res -= sizeof(struct xlog_op_header); if (ticket->t_curr_res < 0) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); @@ -2431,9 +2316,8 @@ xlog_write( xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); } - index = 0; - lv = log_vector; - vecp = lv->lv_iovecp; + len = xlog_write_calc_vec_length(ticket, log_vector, need_start_rec); + *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2457,7 +2341,6 @@ xlog_write( while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { struct xfs_log_iovec *reg; struct xlog_op_header *ophdr; - int start_rec_copy; int copy_len; int copy_off; bool ordered = false; @@ -2473,11 +2356,15 @@ xlog_write( ASSERT(reg->i_len % sizeof(int32_t) == 0); ASSERT((unsigned long)ptr % sizeof(int32_t) == 0); - start_rec_copy = xlog_write_start_rec(ptr, ticket); - if (start_rec_copy) { - record_cnt++; + /* + * Before we start formatting log vectors, we need to + * write a start record. Only do this for the first + * iclog we write to. + */ + if (need_start_rec) { + xlog_write_start_rec(ptr, ticket); xlog_write_adv_cnt(&ptr, &len, &log_offset, - start_rec_copy); + sizeof(struct xlog_op_header)); } ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); @@ -2509,8 +2396,13 @@ xlog_write( xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); } - copy_len += start_rec_copy + sizeof(xlog_op_header_t); + copy_len += sizeof(struct xlog_op_header); record_cnt++; + if (need_start_rec) { + copy_len += sizeof(struct xlog_op_header); + record_cnt++; + need_start_rec = false; + } data_cnt += contwr ? copy_len : 0; error = xlog_write_copy_finish(log, iclog, flags, @@ -2567,119 +2459,106 @@ next_lv: return error; } +static void +xlog_state_activate_iclog( + struct xlog_in_core *iclog, + int *iclogs_changed) +{ + ASSERT(list_empty_careful(&iclog->ic_callbacks)); -/***************************************************************************** - * - * State Machine functions - * - ***************************************************************************** - */ + /* + * If the number of ops in this iclog indicate it just contains the + * dummy transaction, we can change state into IDLE (the second time + * around). Otherwise we should change the state into NEED a dummy. + * We don't need to cover the dummy. + */ + if (*iclogs_changed == 0 && + iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { + *iclogs_changed = 1; + } else { + /* + * We have two dirty iclogs so start over. This could also be + * num of ops indicating this is not the dummy going out. + */ + *iclogs_changed = 2; + } + + iclog->ic_state = XLOG_STATE_ACTIVE; + iclog->ic_offset = 0; + iclog->ic_header.h_num_logops = 0; + memset(iclog->ic_header.h_cycle_data, 0, + sizeof(iclog->ic_header.h_cycle_data)); + iclog->ic_header.h_lsn = 0; +} /* - * An iclog has just finished IO completion processing, so we need to update - * the iclog state and propagate that up into the overall log state. Hence we - * prepare the iclog for cleaning, and then clean all the pending dirty iclogs - * starting from the head, and then wake up any threads that are waiting for the - * iclog to be marked clean. - * - * The ordering of marking iclogs ACTIVE must be maintained, so an iclog - * doesn't become ACTIVE beyond one that is SYNCING. This is also required to - * maintain the notion that we use a ordered wait queue to hold off would be - * writers to the log when every iclog is trying to sync to disk. - * - * Caller must hold the icloglock before calling us. - * - * State Change: !IOERROR -> DIRTY -> ACTIVE + * Loop through all iclogs and mark all iclogs currently marked DIRTY as + * ACTIVE after iclog I/O has completed. */ -STATIC void -xlog_state_clean_iclog( +static void +xlog_state_activate_iclogs( struct xlog *log, - struct xlog_in_core *dirty_iclog) + int *iclogs_changed) { - struct xlog_in_core *iclog; - int changed = 0; - - /* Prepare the completed iclog. */ - if (dirty_iclog->ic_state != XLOG_STATE_IOERROR) - dirty_iclog->ic_state = XLOG_STATE_DIRTY; + struct xlog_in_core *iclog = log->l_iclog; - /* Walk all the iclogs to update the ordered active state. */ - iclog = log->l_iclog; do { - if (iclog->ic_state == XLOG_STATE_DIRTY) { - iclog->ic_state = XLOG_STATE_ACTIVE; - iclog->ic_offset = 0; - ASSERT(list_empty_careful(&iclog->ic_callbacks)); - /* - * If the number of ops in this iclog indicate it just - * contains the dummy transaction, we can - * change state into IDLE (the second time around). - * Otherwise we should change the state into - * NEED a dummy. - * We don't need to cover the dummy. - */ - if (!changed && - (be32_to_cpu(iclog->ic_header.h_num_logops) == - XLOG_COVER_OPS)) { - changed = 1; - } else { - /* - * We have two dirty iclogs so start over - * This could also be num of ops indicates - * this is not the dummy going out. - */ - changed = 2; - } - iclog->ic_header.h_num_logops = 0; - memset(iclog->ic_header.h_cycle_data, 0, - sizeof(iclog->ic_header.h_cycle_data)); - iclog->ic_header.h_lsn = 0; - } else if (iclog->ic_state == XLOG_STATE_ACTIVE) - /* do nothing */; - else - break; /* stop cleaning */ - iclog = iclog->ic_next; - } while (iclog != log->l_iclog); - + if (iclog->ic_state == XLOG_STATE_DIRTY) + xlog_state_activate_iclog(iclog, iclogs_changed); + /* + * The ordering of marking iclogs ACTIVE must be maintained, so + * an iclog doesn't become ACTIVE beyond one that is SYNCING. + */ + else if (iclog->ic_state != XLOG_STATE_ACTIVE) + break; + } while ((iclog = iclog->ic_next) != log->l_iclog); +} +static int +xlog_covered_state( + int prev_state, + int iclogs_changed) +{ /* - * Wake up threads waiting in xfs_log_force() for the dirty iclog - * to be cleaned. + * We usually go to NEED. But we go to NEED2 if the changed indicates we + * are done writing the dummy record. If we are done with the second + * dummy recored (DONE2), then we go to IDLE. */ - wake_up_all(&dirty_iclog->ic_force_wait); + switch (prev_state) { + case XLOG_STATE_COVER_IDLE: + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + break; + case XLOG_STATE_COVER_DONE: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_NEED2; + break; + case XLOG_STATE_COVER_DONE2: + if (iclogs_changed == 1) + return XLOG_STATE_COVER_IDLE; + break; + default: + ASSERT(0); + } - /* - * Change state for the dummy log recording. - * We usually go to NEED. But we go to NEED2 if the changed indicates - * we are done writing the dummy record. - * If we are done with the second dummy recored (DONE2), then - * we go to IDLE. - */ - if (changed) { - switch (log->l_covered_state) { - case XLOG_STATE_COVER_IDLE: - case XLOG_STATE_COVER_NEED: - case XLOG_STATE_COVER_NEED2: - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; + return XLOG_STATE_COVER_NEED; +} - case XLOG_STATE_COVER_DONE: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_NEED2; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; +STATIC void +xlog_state_clean_iclog( + struct xlog *log, + struct xlog_in_core *dirty_iclog) +{ + int iclogs_changed = 0; - case XLOG_STATE_COVER_DONE2: - if (changed == 1) - log->l_covered_state = XLOG_STATE_COVER_IDLE; - else - log->l_covered_state = XLOG_STATE_COVER_NEED; - break; + dirty_iclog->ic_state = XLOG_STATE_DIRTY; - default: - ASSERT(0); - } + xlog_state_activate_iclogs(log, &iclogs_changed); + wake_up_all(&dirty_iclog->ic_force_wait); + + if (iclogs_changed) { + log->l_covered_state = xlog_covered_state(log->l_covered_state, + iclogs_changed); } } @@ -2808,8 +2687,7 @@ xlog_state_iodone_process_iclog( static void xlog_state_do_iclog_callbacks( struct xlog *log, - struct xlog_in_core *iclog, - bool aborted) + struct xlog_in_core *iclog) __releases(&log->l_icloglock) __acquires(&log->l_icloglock) { @@ -2821,7 +2699,7 @@ xlog_state_do_iclog_callbacks( list_splice_init(&iclog->ic_callbacks, &tmp); spin_unlock(&iclog->ic_callback_lock); - xlog_cil_process_committed(&tmp, aborted); + xlog_cil_process_committed(&tmp); spin_lock(&iclog->ic_callback_lock); } @@ -2836,8 +2714,7 @@ xlog_state_do_iclog_callbacks( STATIC void xlog_state_do_callback( - struct xlog *log, - bool aborted) + struct xlog *log) { struct xlog_in_core *iclog; struct xlog_in_core *first_iclog; @@ -2878,9 +2755,11 @@ xlog_state_do_callback( * we'll have to run at least one more complete loop. */ cycled_icloglock = true; - xlog_state_do_iclog_callbacks(log, iclog, aborted); - - xlog_state_clean_iclog(log, iclog); + xlog_state_do_iclog_callbacks(log, iclog); + if (XLOG_FORCED_SHUTDOWN(log)) + wake_up_all(&iclog->ic_force_wait); + else + xlog_state_clean_iclog(log, iclog); iclog = iclog->ic_next; } while (first_iclog != iclog); @@ -2916,25 +2795,22 @@ xlog_state_do_callback( */ STATIC void xlog_state_done_syncing( - struct xlog_in_core *iclog, - bool aborted) + struct xlog_in_core *iclog) { struct xlog *log = iclog->ic_log; spin_lock(&log->l_icloglock); - ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* * If we got an error, either on the first buffer, or in the case of - * split log writes, on the second, we mark ALL iclogs STATE_IOERROR, - * and none should ever be attempted to be written to disk - * again. + * split log writes, on the second, we shut down the file system and + * no iclogs should ever be attempted to be written to disk again. */ - if (iclog->ic_state == XLOG_STATE_SYNCING) + if (!XLOG_FORCED_SHUTDOWN(log)) { + ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); iclog->ic_state = XLOG_STATE_DONE_SYNC; - else - ASSERT(iclog->ic_state == XLOG_STATE_IOERROR); + } /* * Someone could be sleeping prior to writing out the next @@ -2943,9 +2819,8 @@ xlog_state_done_syncing( */ wake_up_all(&iclog->ic_write_wait); spin_unlock(&log->l_icloglock); - xlog_state_do_callback(log, aborted); /* also cleans log */ -} /* xlog_state_done_syncing */ - + xlog_state_do_callback(log); +} /* * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must @@ -3064,21 +2939,21 @@ restart: *logoffsetp = log_offset; return 0; -} /* xlog_state_get_iclog_space */ - -/* The first cnt-1 times through here we don't need to - * move the grant write head because the permanent - * reservation has reserved cnt times the unit amount. - * Release part of current permanent unit reservation and - * reset current reservation to be one units worth. Also - * move grant reservation head forward. +} + +/* + * The first cnt-1 times a ticket goes through here we don't need to move the + * grant write head because the permanent reservation has reserved cnt times the + * unit amount. Release part of current permanent unit reservation and reset + * current reservation to be one units worth. Also move grant reservation head + * forward. */ -STATIC void -xlog_regrant_reserve_log_space( +void +xfs_log_ticket_regrant( struct xlog *log, struct xlog_ticket *ticket) { - trace_xfs_log_regrant_reserve_enter(log, ticket); + trace_xfs_log_ticket_regrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; @@ -3090,21 +2965,20 @@ xlog_regrant_reserve_log_space( ticket->t_curr_res = ticket->t_unit_res; xlog_tic_reset_res(ticket); - trace_xfs_log_regrant_reserve_sub(log, ticket); + trace_xfs_log_ticket_regrant_sub(log, ticket); /* just return if we still have some of the pre-reserved space */ - if (ticket->t_cnt > 0) - return; + if (!ticket->t_cnt) { + xlog_grant_add_space(log, &log->l_reserve_head.grant, + ticket->t_unit_res); + trace_xfs_log_ticket_regrant_exit(log, ticket); - xlog_grant_add_space(log, &log->l_reserve_head.grant, - ticket->t_unit_res); - - trace_xfs_log_regrant_reserve_exit(log, ticket); - - ticket->t_curr_res = ticket->t_unit_res; - xlog_tic_reset_res(ticket); -} /* xlog_regrant_reserve_log_space */ + ticket->t_curr_res = ticket->t_unit_res; + xlog_tic_reset_res(ticket); + } + xfs_log_ticket_put(ticket); +} /* * Give back the space left from a reservation. @@ -3120,18 +2994,19 @@ xlog_regrant_reserve_log_space( * space, the count will stay at zero and the only space remaining will be * in the current reservation field. */ -STATIC void -xlog_ungrant_log_space( +void +xfs_log_ticket_ungrant( struct xlog *log, struct xlog_ticket *ticket) { - int bytes; + int bytes; + + trace_xfs_log_ticket_ungrant(log, ticket); if (ticket->t_cnt > 0) ticket->t_cnt--; - trace_xfs_log_ungrant_enter(log, ticket); - trace_xfs_log_ungrant_sub(log, ticket); + trace_xfs_log_ticket_ungrant_sub(log, ticket); /* * If this is a permanent reservation ticket, we may be able to free @@ -3146,17 +3021,15 @@ xlog_ungrant_log_space( xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); - trace_xfs_log_ungrant_exit(log, ticket); + trace_xfs_log_ticket_ungrant_exit(log, ticket); xfs_log_space_wake(log->l_mp); + xfs_log_ticket_put(ticket); } /* - * This routine will mark the current iclog in the ring as WANT_SYNC - * and move the current iclog pointer to the next iclog in the ring. - * When this routine is called from xlog_state_get_iclog_space(), the - * exact size of the iclog has not yet been determined. All we know is - * that every data block. We have run out of space in this log record. + * This routine will mark the current iclog in the ring as WANT_SYNC and move + * the current iclog pointer to the next iclog in the ring. */ STATIC void xlog_state_switch_iclogs( @@ -3165,6 +3038,8 @@ xlog_state_switch_iclogs( int eventual_size) { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + assert_spin_locked(&log->l_icloglock); + if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; @@ -3199,7 +3074,7 @@ xlog_state_switch_iclogs( } ASSERT(iclog == log->l_iclog); log->l_iclog = iclog->ic_next; -} /* xlog_state_switch_iclogs */ +} /* * Write out all data in the in-core log as of this exact moment in time. @@ -3259,9 +3134,6 @@ xfs_log_force( * previous iclog and go to sleep. */ iclog = iclog->ic_prev; - if (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { if (atomic_read(&iclog->ic_refcnt) == 0) { /* @@ -3277,8 +3149,7 @@ xfs_log_force( if (xlog_state_release_iclog(log, iclog)) goto out_error; - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || - iclog->ic_state == XLOG_STATE_DIRTY) + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) goto out_unlock; } else { /* @@ -3298,17 +3169,8 @@ xfs_log_force( ; } - if (!(flags & XFS_LOG_SYNC)) - goto out_unlock; - - if (iclog->ic_state == XLOG_STATE_IOERROR) - goto out_error; - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; - return 0; - + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3339,9 +3201,6 @@ __xfs_log_force_lsn( goto out_unlock; } - if (iclog->ic_state == XLOG_STATE_DIRTY) - goto out_unlock; - if (iclog->ic_state == XLOG_STATE_ACTIVE) { /* * We sleep here if we haven't already slept (e.g. this is the @@ -3375,20 +3234,8 @@ __xfs_log_force_lsn( *log_flushed = 1; } - if (!(flags & XFS_LOG_SYNC) || - (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY)) - goto out_unlock; - - if (iclog->ic_state == XLOG_STATE_IOERROR) - goto out_error; - - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; - return 0; - + if (flags & XFS_LOG_SYNC) + return xlog_wait_on_iclog(iclog); out_unlock: spin_unlock(&log->l_icloglock); return 0; @@ -3435,33 +3282,6 @@ xfs_log_force_lsn( } /* - * Called when we want to mark the current iclog as being ready to sync to - * disk. - */ -STATIC void -xlog_state_want_sync( - struct xlog *log, - struct xlog_in_core *iclog) -{ - assert_spin_locked(&log->l_icloglock); - - if (iclog->ic_state == XLOG_STATE_ACTIVE) { - xlog_state_switch_iclogs(log, iclog, 0); - } else { - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR); - } -} - - -/***************************************************************************** - * - * TICKET functions - * - ***************************************************************************** - */ - -/* * Free a used ticket when its refcount falls to zero. */ void @@ -3609,7 +3429,6 @@ xlog_ticket_alloc( tic->t_ocnt = cnt; tic->t_tid = prandom_u32(); tic->t_clientid = client; - tic->t_flags = XLOG_TIC_INITED; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; @@ -3618,13 +3437,6 @@ xlog_ticket_alloc( return tic; } - -/****************************************************************************** - * - * Log debug routines - * - ****************************************************************************** - */ #if defined(DEBUG) /* * Make sure that the destination ptr is within the valid data region of @@ -3710,7 +3522,7 @@ xlog_verify_tail_lsn( if (blocks < BTOBB(iclog->ic_offset) + 1) xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); } -} /* xlog_verify_tail_lsn */ +} /* * Perform a number of checks on the iclog before writing to disk. @@ -3813,7 +3625,7 @@ xlog_verify_iclog( } ptr += sizeof(xlog_op_header_t) + op_len; } -} /* xlog_verify_iclog */ +} #endif /* @@ -3937,7 +3749,7 @@ xfs_log_force_umount( spin_lock(&log->l_cilp->xc_push_lock); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); - xlog_state_do_callback(log, true); + xlog_state_do_callback(log); /* return non-zero if log IOERROR transition had already happened */ return retval; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 84e06805160f..1412d6993f1e 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -105,10 +105,6 @@ struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; -xfs_lsn_t xfs_log_done(struct xfs_mount *mp, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - bool regrant); int xfs_log_force(struct xfs_mount *mp, uint flags); int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, int *log_forced); @@ -121,8 +117,7 @@ void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_release_iclog(struct xfs_mount *mp, - struct xlog_in_core *iclog); +void xfs_log_release_iclog(struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, @@ -138,7 +133,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket); void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, bool regrant); -void xlog_cil_process_committed(struct list_head *list, bool aborted); +void xlog_cil_process_committed(struct list_head *list); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 48435cf2aa16..b43f0e8f43f2 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -574,10 +574,10 @@ xlog_discard_busy_extents( */ static void xlog_cil_committed( - struct xfs_cil_ctx *ctx, - bool abort) + struct xfs_cil_ctx *ctx) { struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + bool abort = XLOG_FORCED_SHUTDOWN(ctx->cil->xc_log); /* * If the I/O failed, we're aborting the commit and already shutdown. @@ -613,37 +613,38 @@ xlog_cil_committed( void xlog_cil_process_committed( - struct list_head *list, - bool aborted) + struct list_head *list) { struct xfs_cil_ctx *ctx; while ((ctx = list_first_entry_or_null(list, struct xfs_cil_ctx, iclog_entry))) { list_del(&ctx->iclog_entry); - xlog_cil_committed(ctx, aborted); + xlog_cil_committed(ctx); } } /* - * Push the Committed Item List to the log. If @push_seq flag is zero, then it - * is a background flush and so we can chose to ignore it. Otherwise, if the - * current sequence is the same as @push_seq we need to do a flush. If - * @push_seq is less than the current sequence, then it has already been + * Push the Committed Item List to the log. + * + * If the current sequence is the same as xc_push_seq we need to do a flush. If + * xc_push_seq is less than the current sequence, then it has already been * flushed and we don't need to do anything - the caller will wait for it to * complete if necessary. * - * @push_seq is a value rather than a flag because that allows us to do an - * unlocked check of the sequence number for a match. Hence we can allows log - * forces to run racily and not issue pushes for the same sequence twice. If we - * get a race between multiple pushes for the same sequence they will block on - * the first one and then abort, hence avoiding needless pushes. + * xc_push_seq is checked unlocked against the sequence number for a match. + * Hence we can allow log forces to run racily and not issue pushes for the + * same sequence twice. If we get a race between multiple pushes for the same + * sequence they will block on the first one and then abort, hence avoiding + * needless pushes. */ -STATIC int -xlog_cil_push( - struct xlog *log) +static void +xlog_cil_push_work( + struct work_struct *work) { - struct xfs_cil *cil = log->l_cilp; + struct xfs_cil *cil = + container_of(work, struct xfs_cil, xc_push_work); + struct xlog *log = cil->xc_log; struct xfs_log_vec *lv; struct xfs_cil_ctx *ctx; struct xfs_cil_ctx *new_ctx; @@ -657,9 +658,6 @@ xlog_cil_push( xfs_lsn_t commit_lsn; xfs_lsn_t push_seq; - if (!cil) - return 0; - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -671,6 +669,11 @@ xlog_cil_push( ASSERT(push_seq <= ctx->sequence); /* + * Wake up any background push waiters now this context is being pushed. + */ + wake_up_all(&ctx->push_wait); + + /* * Check if we've anything to push. If there is nothing, then we don't * move on to a new sequence number and so we have to be able to push * this sequence again later. @@ -746,6 +749,7 @@ xlog_cil_push( */ INIT_LIST_HEAD(&new_ctx->committing); INIT_LIST_HEAD(&new_ctx->busy_extents); + init_waitqueue_head(&new_ctx->push_wait); new_ctx->sequence = ctx->sequence + 1; new_ctx->cil = cil; cil->xc_ctx = new_ctx; @@ -803,7 +807,7 @@ xlog_cil_push( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain; - error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true); if (error) goto out_abort_free_ticket; @@ -841,10 +845,11 @@ restart: } spin_unlock(&cil->xc_push_lock); - /* xfs_log_done always frees the ticket on error. */ - commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false); - if (commit_lsn == -1) - goto out_abort; + error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn); + if (error) + goto out_abort_free_ticket; + + xfs_log_ticket_ungrant(log, tic); spin_lock(&commit_iclog->ic_callback_lock); if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { @@ -867,28 +872,20 @@ restart: spin_unlock(&cil->xc_push_lock); /* release the hounds! */ - return xfs_log_release_iclog(log->l_mp, commit_iclog); + xfs_log_release_iclog(commit_iclog); + return; out_skip: up_write(&cil->xc_ctx_lock); xfs_log_ticket_put(new_ctx->ticket); kmem_free(new_ctx); - return 0; + return; out_abort_free_ticket: - xfs_log_ticket_put(tic); + xfs_log_ticket_ungrant(log, tic); out_abort: - xlog_cil_committed(ctx, true); - return -EIO; -} - -static void -xlog_cil_push_work( - struct work_struct *work) -{ - struct xfs_cil *cil = container_of(work, struct xfs_cil, - xc_push_work); - xlog_cil_push(cil->xc_log); + ASSERT(XLOG_FORCED_SHUTDOWN(log)); + xlog_cil_committed(ctx); } /* @@ -900,7 +897,7 @@ xlog_cil_push_work( */ static void xlog_cil_push_background( - struct xlog *log) + struct xlog *log) __releases(cil->xc_ctx_lock) { struct xfs_cil *cil = log->l_cilp; @@ -914,14 +911,36 @@ xlog_cil_push_background( * don't do a background push if we haven't used up all the * space available yet. */ - if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { + up_read(&cil->xc_ctx_lock); return; + } spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); } + + /* + * Drop the context lock now, we can't hold that if we need to sleep + * because we are over the blocking threshold. The push_lock is still + * held, so blocking threshold sleep/wakeup is still correctly + * serialised here. + */ + up_read(&cil->xc_ctx_lock); + + /* + * If we are well over the space limit, throttle the work that is being + * done until the push work on this context has begun. + */ + if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { + trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); + ASSERT(cil->xc_ctx->space_used < log->l_logsize); + xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock); + return; + } + spin_unlock(&cil->xc_push_lock); } @@ -1017,7 +1036,10 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = xc_commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, regrant); + if (regrant && !XLOG_FORCED_SHUTDOWN(log)) + xfs_log_ticket_regrant(log, tp->t_ticket); + else + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; xfs_trans_unreserve_and_mod_sb(tp); @@ -1038,9 +1060,9 @@ xfs_log_commit_cil( if (lip->li_ops->iop_committing) lip->li_ops->iop_committing(lip, xc_commit_lsn); } - xlog_cil_push_background(log); - up_read(&cil->xc_ctx_lock); + /* xlog_cil_push_background() releases cil->xc_ctx_lock */ + xlog_cil_push_background(log); } /* @@ -1199,6 +1221,7 @@ xlog_cil_init( INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); + init_waitqueue_head(&ctx->push_wait); ctx->sequence = 1; ctx->cil = cil; cil->xc_ctx = ctx; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b192c5a9f9fd..ec22c7a3867f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -51,13 +51,11 @@ enum xlog_iclog_state { }; /* - * Flags to log ticket + * Log ticket flags */ -#define XLOG_TIC_INITED 0x1 /* has been initialized */ -#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ +#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */ #define XLOG_TIC_FLAGS \ - { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } /* @@ -242,6 +240,7 @@ struct xfs_cil_ctx { struct xfs_log_vec *lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ + wait_queue_head_t push_wait; /* background push throttle */ struct work_struct discard_endio_work; }; @@ -318,13 +317,53 @@ struct xfs_cil { * tries to keep 25% of the log free, so we need to keep below that limit or we * risk running out of free log space to start any new transactions. * - * In order to keep background CIL push efficient, we will set a lower - * threshold at which background pushing is attempted without blocking current - * transaction commits. A separate, higher bound defines when CIL pushes are - * enforced to ensure we stay within our maximum checkpoint size bounds. - * threshold, yet give us plenty of space for aggregation on large logs. + * In order to keep background CIL push efficient, we only need to ensure the + * CIL is large enough to maintain sufficient in-memory relogging to avoid + * repeated physical writes of frequently modified metadata. If we allow the CIL + * to grow to a substantial fraction of the log, then we may be pinning hundreds + * of megabytes of metadata in memory until the CIL flushes. This can cause + * issues when we are running low on memory - pinned memory cannot be reclaimed, + * and the CIL consumes a lot of memory. Hence we need to set an upper physical + * size limit for the CIL that limits the maximum amount of memory pinned by the + * CIL but does not limit performance by reducing relogging efficiency + * significantly. + * + * As such, the CIL push threshold ends up being the smaller of two thresholds: + * - a threshold large enough that it allows CIL to be pushed and progress to be + * made without excessive blocking of incoming transaction commits. This is + * defined to be 12.5% of the log space - half the 25% push threshold of the + * AIL. + * - small enough that it doesn't pin excessive amounts of memory but maintains + * close to peak relogging efficiency. This is defined to be 16x the iclog + * buffer window (32MB) as measurements have shown this to be roughly the + * point of diminishing performance increases under highly concurrent + * modification workloads. + * + * To prevent the CIL from overflowing upper commit size bounds, we introduce a + * new threshold at which we block committing transactions until the background + * CIL commit commences and switches to a new context. While this is not a hard + * limit, it forces the process committing a transaction to the CIL to block and + * yeild the CPU, giving the CIL push work a chance to be scheduled and start + * work. This prevents a process running lots of transactions from overfilling + * the CIL because it is not yielding the CPU. We set the blocking limit at + * twice the background push space threshold so we keep in line with the AIL + * push thresholds. + * + * Note: this is not a -hard- limit as blocking is applied after the transaction + * is inserted into the CIL and the push has been triggered. It is largely a + * throttling mechanism that allows the CIL push to be scheduled and run. A hard + * limit will be difficult to implement without introducing global serialisation + * in the CIL commit fast path, and it's not at all clear that we actually need + * such hard limits given the ~7 years we've run without a hard limit before + * finding the first situation where a checkpoint size overflow actually + * occurred. Hence the simple throttle, and an ASSERT check to tell us that + * we've overrun the max size. */ -#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) +#define XLOG_CIL_SPACE_LIMIT(log) \ + min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4) + +#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \ + (XLOG_CIL_SPACE_LIMIT(log) * 2) /* * ticket grant locks, queues and accounting have their own cachlines @@ -402,7 +441,8 @@ struct xlog { #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) -#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) +#define XLOG_FORCED_SHUTDOWN(log) \ + (unlikely((log)->l_flags & XLOG_IO_ERROR)) /* common routines */ extern int @@ -438,14 +478,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); -int -xlog_write( - struct xlog *log, - struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, - xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, - uint flags); +int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, uint flags, + bool need_start_rec); +int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, + struct xlog_in_core **iclog, xfs_lsn_t *lsn); +void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); +void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); /* * When we crack an atomic LSN, we sample it first so that the value will not @@ -525,12 +565,6 @@ xlog_cil_force(struct xlog *log) } /* - * Unmount record type is used as a pseudo transaction type for the ticket. - * It's value must be outside the range of XFS_TRANS_* values. - */ -#define XLOG_UNMOUNT_REC_TYPE (-1U) - -/* * Wrapper function for waiting on a wait queue serialised against wakeups * by a spinlock. This matches the semantics of all the wait queues used in the * log code. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 25cfc85dbaa7..11c3502b07b1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2869,8 +2869,8 @@ xfs_recover_inode_owner_change( return -ENOMEM; /* instantiate the inode */ + ASSERT(dip->di_version >= 3); xfs_inode_from_disk(ip, dip); - ASSERT(ip->i_d.di_version >= 3); error = xfs_iformat_fork(ip, dip); if (error) @@ -2997,7 +2997,7 @@ xlog_recover_inode_pass2( * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ - if (!xfs_sb_version_hascrc(&mp->m_sb) && + if (!xfs_sb_version_has_v3inode(&mp->m_sb) && ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less @@ -3068,7 +3068,7 @@ xlog_recover_inode_pass2( error = -EFSCORRUPTED; goto out_release; } - isize = xfs_log_dinode_size(ldip->di_version); + isize = xfs_log_dinode_size(mp); if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, ldip, @@ -4947,7 +4947,7 @@ xlog_recover_clear_agi_bucket( if (error) goto out_abort; - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); @@ -5083,7 +5083,7 @@ xlog_recover_process_iunlinks( * buffer reference though, so that it stays pinned in memory * while we need the buffer. */ - agi = XFS_BUF_TO_AGI(agibp); + agi = agibp->b_addr; xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { @@ -5636,7 +5636,7 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &mp->m_sb; - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, bp->b_addr); xfs_buf_relse(bp); /* re-initialise in-core superblock and geometry structures */ @@ -5809,7 +5809,6 @@ xlog_recover_check_summary( struct xlog *log) { xfs_mount_t *mp; - xfs_agf_t *agfp; xfs_buf_t *agfbp; xfs_buf_t *agibp; xfs_agnumber_t agno; @@ -5829,7 +5828,8 @@ xlog_recover_check_summary( xfs_alert(mp, "%s agf read failed agno %d error %d", __func__, agno, error); } else { - agfp = XFS_BUF_TO_AGF(agfbp); + struct xfs_agf *agfp = agfbp->b_addr; + freeblks += be32_to_cpu(agfp->agf_freeblks) + be32_to_cpu(agfp->agf_flcount); xfs_buf_relse(agfbp); @@ -5840,7 +5840,7 @@ xlog_recover_check_summary( xfs_alert(mp, "%s agi read failed agno %d error %d", __func__, agno, error); } else { - struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + struct xfs_agi *agi = agibp->b_addr; itotal += be32_to_cpu(agi->agi_count); ifree += be32_to_cpu(agi->agi_freecount); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 56efe140c923..c5513e5a226a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -310,7 +310,7 @@ reread: /* * Initialize the mount structure from the superblock. */ - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_from_disk(sbp, bp->b_addr); /* * If we haven't validated the superblock, do so now before we try diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 88ab09ed29e7..50c43422fa17 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -167,6 +167,7 @@ typedef struct xfs_mount { struct xfs_kobj m_error_meta_kobj; struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ + struct ratelimit_state m_flush_inodes_ratelimit; struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 0b0909657bad..c225691fad15 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -121,12 +121,11 @@ xfs_qm_dqpurge( { struct xfs_mount *mp = dqp->q_mount; struct xfs_quotainfo *qi = mp->m_quotainfo; + int error = -EAGAIN; xfs_dqlock(dqp); - if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { - xfs_dqunlock(dqp); - return -EAGAIN; - } + if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) + goto out_unlock; dqp->dq_flags |= XFS_DQ_FREEING; @@ -139,7 +138,6 @@ xfs_qm_dqpurge( */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; - int error; /* * We don't care about getting disk errors here. We need @@ -149,6 +147,8 @@ xfs_qm_dqpurge( if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); + } else if (error == -EAGAIN) { + goto out_unlock; } xfs_dqflock(dqp); } @@ -174,6 +174,10 @@ xfs_qm_dqpurge( xfs_qm_dqdestroy(dqp); return 0; + +out_unlock: + xfs_dqunlock(dqp); + return error; } /* @@ -326,16 +330,16 @@ xfs_qm_dqattach_locked( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, - doalloc, &ip->i_udquot); + error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)), + XFS_DQ_USER, doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, - doalloc, &ip->i_gdquot); + error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)), + XFS_DQ_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); @@ -871,12 +875,20 @@ xfs_qm_reset_dqcounts( ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; - ddq->d_btimer = 0; - ddq->d_itimer = 0; - ddq->d_rtbtimer = 0; - ddq->d_bwarns = 0; - ddq->d_iwarns = 0; - ddq->d_rtbwarns = 0; + + /* + * dquot id 0 stores the default grace period and the maximum + * warning limit that were set by the administrator, so we + * should not reset them. + */ + if (ddq->d_id != 0) { + ddq->d_btimer = 0; + ddq->d_itimer = 0; + ddq->d_rtbtimer = 0; + ddq->d_bwarns = 0; + ddq->d_iwarns = 0; + ddq->d_rtbwarns = 0; + } if (xfs_sb_version_hascrc(&mp->m_sb)) { xfs_update_cksum((char *)&dqb[j], @@ -1613,8 +1625,8 @@ xfs_qm_dqfree_one( int xfs_qm_vop_dqalloc( struct xfs_inode *ip, - xfs_dqid_t uid, - xfs_dqid_t gid, + kuid_t uid, + kgid_t gid, prid_t prid, uint flags, struct xfs_dquot **O_udqpp, @@ -1622,6 +1634,8 @@ xfs_qm_vop_dqalloc( struct xfs_dquot **O_pdqpp) { struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + struct user_namespace *user_ns = inode->i_sb->s_user_ns; struct xfs_dquot *uq = NULL; struct xfs_dquot *gq = NULL; struct xfs_dquot *pq = NULL; @@ -1635,7 +1649,7 @@ xfs_qm_vop_dqalloc( xfs_ilock(ip, lockflags); if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) - gid = ip->i_d.di_gid; + gid = inode->i_gid; /* * Attach the dquot(s) to this inode, doing a dquot allocation @@ -1650,7 +1664,7 @@ xfs_qm_vop_dqalloc( } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { - if (ip->i_d.di_uid != uid) { + if (!uid_eq(inode->i_uid, uid)) { /* * What we need is the dquot that has this uid, and * if we send the inode to dqget, the uid of the inode @@ -1661,7 +1675,8 @@ xfs_qm_vop_dqalloc( * holding ilock. */ xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq); + error = xfs_qm_dqget(mp, from_kuid(user_ns, uid), + XFS_DQ_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; @@ -1682,9 +1697,10 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { - if (ip->i_d.di_gid != gid) { + if (!gid_eq(inode->i_gid, gid)) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq); + error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), + XFS_DQ_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1810,7 +1826,7 @@ xfs_qm_vop_chown_reserve( XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; if (XFS_IS_UQUOTA_ON(mp) && udqp && - ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) { + i_uid_read(VFS_I(ip)) != be32_to_cpu(udqp->q_core.d_id)) { udq_delblks = udqp; /* * If there are delayed allocation blocks, then we have to @@ -1823,7 +1839,7 @@ xfs_qm_vop_chown_reserve( } } if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && - ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) { + i_gid_read(VFS_I(ip)) != be32_to_cpu(gdqp->q_core.d_id)) { gdq_delblks = gdqp; if (delblks) { ASSERT(ip->i_gdquot); @@ -1920,14 +1936,15 @@ xfs_qm_vop_create_dqattach( if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); - ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + ASSERT(i_uid_read(VFS_I(ip)) == be32_to_cpu(udqp->q_core.d_id)); ip->i_udquot = xfs_qm_dqhold(udqp); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); - ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); + ASSERT(i_gid_read(VFS_I(ip)) == be32_to_cpu(gdqp->q_core.d_id)); + ip->i_gdquot = xfs_qm_dqhold(gdqp); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 1ea82764bf89..5d5ac65aa1cc 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -29,8 +29,6 @@ xfs_qm_log_quotaoff( int error; struct xfs_qoff_logitem *qoffi; - *qoffstartp = NULL; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); if (error) goto out; @@ -62,7 +60,7 @@ out: STATIC int xfs_qm_log_quotaoff_end( struct xfs_mount *mp, - struct xfs_qoff_logitem *startqoff, + struct xfs_qoff_logitem **startqoff, uint flags) { struct xfs_trans *tp; @@ -73,9 +71,10 @@ xfs_qm_log_quotaoff_end( if (error) return error; - qoffi = xfs_trans_get_qoff_item(tp, startqoff, + qoffi = xfs_trans_get_qoff_item(tp, *startqoff, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); + *startqoff = NULL; /* * We have to make sure that the transaction is secure on disk before we @@ -103,7 +102,7 @@ xfs_qm_scall_quotaoff( uint dqtype; int error; uint inactivate_flags; - struct xfs_qoff_logitem *qoffstart; + struct xfs_qoff_logitem *qoffstart = NULL; /* * No file system can have quotas enabled on disk but not in core. @@ -228,7 +227,7 @@ xfs_qm_scall_quotaoff( * So, we have QUOTAOFF start and end logitems; the start * logitem won't get overwritten until the end logitem appears... */ - error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags); if (error) { /* We're screwed now. Shutdown is the only option. */ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -261,6 +260,8 @@ xfs_qm_scall_quotaoff( } out_unlock: + if (error && qoffstart) + xfs_qm_qoff_logitem_relse(qoffstart); mutex_unlock(&q->qi_quotaofflock); return error; } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index efe42ae7a2f3..aa8fc1f55fbd 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -86,7 +86,7 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); -extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t, prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, struct xfs_dquot **); extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, @@ -109,7 +109,7 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *); #else static inline int -xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, +xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, prid_t prid, uint flags, struct xfs_dquot **udqp, struct xfs_dquot **gdqp, struct xfs_dquot **pdqp) { diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 113883c4f202..f70f1255220b 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -57,13 +57,13 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) /* Loop over all stats groups */ for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { - len += snprintf(buf + len, PATH_MAX - len, "%s", + len += scnprintf(buf + len, PATH_MAX - len, "%s", xstats[i].desc); /* inner loop does each group */ for (; j < xstats[i].endpoint; j++) - len += snprintf(buf + len, PATH_MAX - len, " %u", + len += scnprintf(buf + len, PATH_MAX - len, " %u", counter_val(stats, j)); - len += snprintf(buf + len, PATH_MAX - len, "\n"); + len += scnprintf(buf + len, PATH_MAX - len, "\n"); } /* extra precision counters */ for_each_possible_cpu(i) { @@ -72,9 +72,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; } - len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); - len += snprintf(buf + len, PATH_MAX-len, "debug %u\n", + len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n", #if defined(DEBUG) 1); #else diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2094386af8ac..abf06bf9c3f3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -528,6 +528,9 @@ xfs_flush_inodes( { struct super_block *sb = mp->m_super; + if (!__ratelimit(&mp->m_flush_inodes_ratelimit)) + return; + if (down_read_trylock(&sb->s_umount)) { sync_inodes_sb(sb); up_read(&sb->s_umount); @@ -1366,6 +1369,17 @@ xfs_fc_fill_super( if (error) goto out_free_names; + /* + * Cap the number of invocations of xfs_flush_inodes to 16 for every + * quarter of a second. The magic numbers here were determined by + * observation neither to cause stalls in writeback when there are a + * lot of IO threads and the fs is near ENOSPC, nor cause any fstest + * regressions. YMMV. + */ + ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16); + ratelimit_set_flags(&mp->m_flush_inodes_ratelimit, + RATELIMIT_MSG_ON_RELEASE); + error = xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; @@ -1861,7 +1875,8 @@ xfs_init_zones(void) xfs_ili_zone = kmem_cache_create("xfs_ili", sizeof(struct xfs_inode_log_item), 0, - SLAB_MEM_SPREAD, NULL); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); if (!xfs_ili_zone) goto out_destroy_inode_zone; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index d762d42ed0ff..13fb4b919648 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -176,15 +176,12 @@ xfs_symlink( return -ENAMETOOLONG; ASSERT(pathlen > 0); - udqp = gdqp = NULL; prid = xfs_get_initial_prid(dp); /* * Make sure that we have allocated dquot(s) on disk. */ - error = xfs_qm_vop_dqalloc(dp, - xfs_kuid_to_uid(current_fsuid()), - xfs_kgid_to_gid(current_fsgid()), prid, + error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) @@ -194,7 +191,7 @@ xfs_symlink( * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. */ - if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) + if (pathlen <= XFS_LITINO(mp)) fs_blocks = 0; else fs_blocks = xfs_symlink_blocks(mp, pathlen); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index bc85b89f88ca..120398a37c2a 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -6,6 +6,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" @@ -27,6 +28,7 @@ #include "xfs_log_recover.h" #include "xfs_filestream.h" #include "xfs_fsmap.h" +#include "xfs_btree_staging.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e242988f57fb..a4323a63438d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -35,6 +35,12 @@ struct xfs_icreate_log; struct xfs_owner_info; struct xfs_trans_res; struct xfs_inobt_rec_incore; +union xfs_btree_ptr; + +#define XFS_ATTR_FILTER_FLAGS \ + { XFS_ATTR_ROOT, "ROOT" }, \ + { XFS_ATTR_SECURE, "SECURE" }, \ + { XFS_ATTR_INCOMPLETE, "INCOMPLETE" } DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -45,39 +51,39 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class, __field(u32, hashval) __field(u32, blkno) __field(u32, offset) - __field(void *, alist) + __field(void *, buffer) __field(int, bufsize) __field(int, count) __field(int, firstu) __field(int, dupcnt) - __field(int, flags) + __field(unsigned int, attr_filter) ), TP_fast_assign( __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; __entry->ino = ctx->dp->i_ino; - __entry->hashval = ctx->cursor->hashval; - __entry->blkno = ctx->cursor->blkno; - __entry->offset = ctx->cursor->offset; - __entry->alist = ctx->alist; + __entry->hashval = ctx->cursor.hashval; + __entry->blkno = ctx->cursor.blkno; + __entry->offset = ctx->cursor.offset; + __entry->buffer = ctx->buffer; __entry->bufsize = ctx->bufsize; __entry->count = ctx->count; __entry->firstu = ctx->firstu; - __entry->flags = ctx->flags; + __entry->attr_filter = ctx->attr_filter; ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist %p size %u count %u firstu %u flags %d %s", + "buffer %p size %u count %u firstu %u filter %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->hashval, __entry->blkno, __entry->offset, __entry->dupcnt, - __entry->alist, + __entry->buffer, __entry->bufsize, __entry->count, __entry->firstu, - __entry->flags, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + __print_flags(__entry->attr_filter, "|", + XFS_ATTR_FILTER_FLAGS) ) ) @@ -169,31 +175,31 @@ TRACE_EVENT(xfs_attr_list_node_descend, __field(u32, hashval) __field(u32, blkno) __field(u32, offset) - __field(void *, alist) + __field(void *, buffer) __field(int, bufsize) __field(int, count) __field(int, firstu) __field(int, dupcnt) - __field(int, flags) + __field(unsigned int, attr_filter) __field(u32, bt_hashval) __field(u32, bt_before) ), TP_fast_assign( __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; __entry->ino = ctx->dp->i_ino; - __entry->hashval = ctx->cursor->hashval; - __entry->blkno = ctx->cursor->blkno; - __entry->offset = ctx->cursor->offset; - __entry->alist = ctx->alist; + __entry->hashval = ctx->cursor.hashval; + __entry->blkno = ctx->cursor.blkno; + __entry->offset = ctx->cursor.offset; + __entry->buffer = ctx->buffer; __entry->bufsize = ctx->bufsize; __entry->count = ctx->count; __entry->firstu = ctx->firstu; - __entry->flags = ctx->flags; + __entry->attr_filter = ctx->attr_filter; __entry->bt_hashval = be32_to_cpu(btree->hashval); __entry->bt_before = be32_to_cpu(btree->before); ), TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " - "alist %p size %u count %u firstu %u flags %d %s " + "buffer %p size %u count %u firstu %u filter %s " "node hashval %u, node before %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, @@ -201,12 +207,12 @@ TRACE_EVENT(xfs_attr_list_node_descend, __entry->blkno, __entry->offset, __entry->dupcnt, - __entry->alist, + __entry->buffer, __entry->bufsize, __entry->count, __entry->firstu, - __entry->flags, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __print_flags(__entry->attr_filter, "|", + XFS_ATTR_FILTER_FLAGS), __entry->bt_hashval, __entry->bt_before) ); @@ -995,8 +1001,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, DEFINE_EVENT(xfs_loggrant_class, name, \ TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \ TP_ARGS(log, tic)) -DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); -DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); @@ -1005,12 +1009,13 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve); DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); -DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait); DECLARE_EVENT_CLASS(xfs_log_item_class, TP_PROTO(struct xfs_log_item *lip), @@ -1701,7 +1706,8 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(int, namelen) __field(int, valuelen) __field(xfs_dahash_t, hashval) - __field(int, flags) + __field(unsigned int, attr_filter) + __field(unsigned int, attr_flags) __field(int, op_flags) ), TP_fast_assign( @@ -1712,11 +1718,12 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen = args->namelen; __entry->valuelen = args->valuelen; __entry->hashval = args->hashval; - __entry->flags = args->flags; + __entry->attr_filter = args->attr_filter; + __entry->attr_flags = args->attr_flags; __entry->op_flags = args->op_flags; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " - "hashval 0x%x flags %s op_flags %s", + "hashval 0x%x filter %s flags %s op_flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, @@ -1724,7 +1731,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen, __entry->valuelen, __entry->hashval, - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __print_flags(__entry->attr_filter, "|", + XFS_ATTR_FILTER_FLAGS), + __print_flags(__entry->attr_flags, "|", + { XATTR_CREATE, "CREATE" }, + { XATTR_REPLACE, "REPLACE" }), __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) ) @@ -3594,6 +3605,151 @@ TRACE_EVENT(xfs_check_new_dalign, __entry->calc_rootino) ) +TRACE_EVENT(xfs_btree_commit_afakeroot, + TP_PROTO(struct xfs_btree_cur *cur), + TP_ARGS(cur), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(unsigned int, levels) + __field(unsigned int, blocks) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->agno = cur->bc_ag.agno; + __entry->agbno = cur->bc_ag.afake->af_root; + __entry->levels = cur->bc_ag.afake->af_levels; + __entry->blocks = cur->bc_ag.afake->af_blocks; + ), + TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->agno, + __entry->levels, + __entry->blocks, + __entry->agbno) +) + +TRACE_EVENT(xfs_btree_commit_ifakeroot, + TP_PROTO(struct xfs_btree_cur *cur), + TP_ARGS(cur), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(unsigned int, levels) + __field(unsigned int, blocks) + __field(int, whichfork) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp, + cur->bc_ino.ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(cur->bc_mp, + cur->bc_ino.ip->i_ino); + __entry->levels = cur->bc_ino.ifake->if_levels; + __entry->blocks = cur->bc_ino.ifake->if_blocks; + __entry->whichfork = cur->bc_ino.whichfork; + ), + TP_printk("dev %d:%d btree %s ag %u agino %u whichfork %s levels %u blocks %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->agno, + __entry->agino, + __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data", + __entry->levels, + __entry->blocks) +) + +TRACE_EVENT(xfs_btree_bload_level_geometry, + TP_PROTO(struct xfs_btree_cur *cur, unsigned int level, + uint64_t nr_this_level, unsigned int nr_per_block, + unsigned int desired_npb, uint64_t blocks, + uint64_t blocks_with_extra), + TP_ARGS(cur, level, nr_this_level, nr_per_block, desired_npb, blocks, + blocks_with_extra), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(unsigned int, level) + __field(unsigned int, nlevels) + __field(uint64_t, nr_this_level) + __field(unsigned int, nr_per_block) + __field(unsigned int, desired_npb) + __field(unsigned long long, blocks) + __field(unsigned long long, blocks_with_extra) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->nlevels = cur->bc_nlevels; + __entry->nr_this_level = nr_this_level; + __entry->nr_per_block = nr_per_block; + __entry->desired_npb = desired_npb; + __entry->blocks = blocks; + __entry->blocks_with_extra = blocks_with_extra; + ), + TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->nlevels, + __entry->nr_this_level, + __entry->nr_per_block, + __entry->desired_npb, + __entry->blocks, + __entry->blocks_with_extra) +) + +TRACE_EVENT(xfs_btree_bload_block, + TP_PROTO(struct xfs_btree_cur *cur, unsigned int level, + uint64_t block_idx, uint64_t nr_blocks, + union xfs_btree_ptr *ptr, unsigned int nr_records), + TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_btnum_t, btnum) + __field(unsigned int, level) + __field(unsigned long long, block_idx) + __field(unsigned long long, nr_blocks) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(unsigned int, nr_records) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->btnum = cur->bc_btnum; + __entry->level = level; + __entry->block_idx = block_idx; + __entry->nr_blocks = nr_blocks; + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + xfs_fsblock_t fsb = be64_to_cpu(ptr->l); + + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); + __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsb); + } else { + __entry->agno = cur->bc_ag.agno; + __entry->agbno = be32_to_cpu(ptr->s); + } + __entry->nr_records = nr_records; + ), + TP_printk("dev %d:%d btree %s level %u block %llu/%llu fsb (%u/%u) recs %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->level, + __entry->block_idx, + __entry->nr_blocks, + __entry->agno, + __entry->agbno, + __entry->nr_records) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 3b208f9a865c..28b983ff8b11 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -9,6 +9,7 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_log_priv.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_extent_busy.h" @@ -150,8 +151,9 @@ xfs_trans_reserve( uint blocks, uint rtextents) { - int error = 0; - bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + struct xfs_mount *mp = tp->t_mountp; + int error = 0; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -162,7 +164,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (blocks > 0) { - error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); + error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; @@ -191,9 +193,9 @@ xfs_trans_reserve( if (tp->t_ticket != NULL) { ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); - error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); + error = xfs_log_regrant(mp, tp->t_ticket); } else { - error = xfs_log_reserve(tp->t_mountp, + error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount, &tp->t_ticket, XFS_TRANSACTION, @@ -213,7 +215,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (rtextents > 0) { - error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents)); + error = xfs_mod_frextents(mp, -((int64_t)rtextents)); if (error) { error = -ENOSPC; goto undo_log; @@ -229,7 +231,7 @@ xfs_trans_reserve( */ undo_log: if (resp->tr_logres > 0) { - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false); + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; tp->t_log_res = 0; tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; @@ -237,7 +239,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd); + xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -306,6 +308,11 @@ xfs_trans_alloc( * * Note the zero-length reservation; this transaction MUST be cancelled * without any dirty data. + * + * Callers should obtain freeze protection to avoid two conflicts with fs + * freezing: (1) having active transactions trip the m_active_trans ASSERTs; + * and (2) grabbing buffers at the same time that freeze is trying to drain + * the buffer LRU list. */ int xfs_trans_alloc_empty( @@ -450,7 +457,7 @@ xfs_trans_apply_sb_deltas( int whole = 0; bp = xfs_trans_getsb(tp, tp->t_mountp); - sbp = XFS_BUF_TO_SBP(bp); + sbp = bp->b_addr; /* * Check that superblock mods match the mods made to AGF counters. @@ -999,9 +1006,10 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); - if (commit_lsn == -1 && !error) - error = -EIO; + if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log)) + xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); + else + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -1060,7 +1068,7 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xfs_log_done(mp, tp->t_ticket, NULL, false); + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 00cc5b8734be..564253550b75 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -32,6 +32,7 @@ STATIC void xfs_ail_check( struct xfs_ail *ailp, struct xfs_log_item *lip) + __must_hold(&ailp->ail_lock) { struct xfs_log_item *prev_lip; struct xfs_log_item *next_lip; @@ -108,17 +109,25 @@ xfs_ail_next( * We need the AIL lock in order to get a coherent read of the lsn of the last * item in the AIL. */ +static xfs_lsn_t +__xfs_ail_min_lsn( + struct xfs_ail *ailp) +{ + struct xfs_log_item *lip = xfs_ail_min(ailp); + + if (lip) + return lip->li_lsn; + return 0; +} + xfs_lsn_t xfs_ail_min_lsn( struct xfs_ail *ailp) { - xfs_lsn_t lsn = 0; - struct xfs_log_item *lip; + xfs_lsn_t lsn; spin_lock(&ailp->ail_lock); - lip = xfs_ail_min(ailp); - if (lip) - lsn = lip->li_lsn; + lsn = __xfs_ail_min_lsn(ailp); spin_unlock(&ailp->ail_lock); return lsn; @@ -529,8 +538,9 @@ xfsaild( { struct xfs_ail *ailp = data; long tout = 0; /* milliseconds */ + unsigned int noreclaim_flag; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); set_freezable(); while (1) { @@ -601,6 +611,7 @@ xfsaild( tout = xfsaild_push(ailp); } + memalloc_noreclaim_restore(noreclaim_flag); return 0; } @@ -678,6 +689,28 @@ xfs_ail_push_all_sync( finish_wait(&ailp->ail_empty, &wait); } +void +xfs_ail_update_finish( + struct xfs_ail *ailp, + xfs_lsn_t old_lsn) __releases(ailp->ail_lock) +{ + struct xfs_mount *mp = ailp->ail_mount; + + /* if the tail lsn hasn't changed, don't do updates or wakeups. */ + if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) { + spin_unlock(&ailp->ail_lock); + return; + } + + if (!XFS_FORCED_SHUTDOWN(mp)) + xlog_assign_tail_lsn_locked(mp); + + if (list_empty(&ailp->ail_head)) + wake_up_all(&ailp->ail_empty); + spin_unlock(&ailp->ail_lock); + xfs_log_space_wake(mp); +} + /* * xfs_trans_ail_update - bulk AIL insertion operation. * @@ -709,7 +742,7 @@ xfs_trans_ail_update_bulk( xfs_lsn_t lsn) __releases(ailp->ail_lock) { struct xfs_log_item *mlip; - int mlip_changed = 0; + xfs_lsn_t tail_lsn = 0; int i; LIST_HEAD(tmp); @@ -724,9 +757,10 @@ xfs_trans_ail_update_bulk( continue; trace_xfs_ail_move(lip, lip->li_lsn, lsn); + if (mlip == lip && !tail_lsn) + tail_lsn = lip->li_lsn; + xfs_ail_delete(ailp, lip); - if (mlip == lip) - mlip_changed = 1; } else { trace_xfs_ail_insert(lip, 0, lsn); } @@ -737,23 +771,23 @@ xfs_trans_ail_update_bulk( if (!list_empty(&tmp)) xfs_ail_splice(ailp, cur, &tmp, lsn); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) - xlog_assign_tail_lsn_locked(ailp->ail_mount); - spin_unlock(&ailp->ail_lock); - - xfs_log_space_wake(ailp->ail_mount); - } else { - spin_unlock(&ailp->ail_lock); - } + xfs_ail_update_finish(ailp, tail_lsn); } -bool +/* + * Delete one log item from the AIL. + * + * If this item was at the tail of the AIL, return the LSN of the log item so + * that we can use it to check if the LSN of the tail of the log has moved + * when finishing up the AIL delete process in xfs_ail_update_finish(). + */ +xfs_lsn_t xfs_ail_delete_one( struct xfs_ail *ailp, struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); + xfs_lsn_t lsn = lip->li_lsn; trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); @@ -761,7 +795,9 @@ xfs_ail_delete_one( clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; - return mlip == lip; + if (mlip == lip) + return lsn; + return 0; } /** @@ -789,10 +825,10 @@ void xfs_trans_ail_delete( struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock) + int shutdown_type) { struct xfs_mount *mp = ailp->ail_mount; - bool mlip_changed; + xfs_lsn_t tail_lsn; if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); @@ -805,17 +841,8 @@ xfs_trans_ail_delete( return; } - mlip_changed = xfs_ail_delete_one(ailp, lip); - if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(mp)) - xlog_assign_tail_lsn_locked(mp); - if (list_empty(&ailp->ail_head)) - wake_up_all(&ailp->ail_empty); - } - - spin_unlock(&ailp->ail_lock); - if (mlip_changed) - xfs_log_space_wake(ailp->ail_mount); + tail_lsn = xfs_ail_delete_one(ailp, lip); + xfs_ail_update_finish(ailp, tail_lsn); } int diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 2e073c1c4614..35655eac01a6 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -91,9 +91,11 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn) + __releases(ailp->ail_lock); void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->ail_lock); + int shutdown_type); static inline void xfs_trans_ail_remove( diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index b0fedb543f97..fc5d7276026e 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -12,53 +12,30 @@ #include "xfs_inode.h" #include "xfs_attr.h" #include "xfs_acl.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include <linux/posix_acl_xattr.h> -#include <linux/xattr.h> static int xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *value, size_t size) { - int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(inode); - int error, asize = size; - size_t namelen = strlen(name); - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (!size) { - xflags |= ATTR_KERNOVAL; - value = NULL; - } + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = handler->flags, + .name = name, + .namelen = strlen(name), + .value = value, + .valuelen = size, + }; + int error; - error = xfs_attr_get(ip, name, namelen, (unsigned char **)&value, - &asize, xflags); + error = xfs_attr_get(&args); if (error) return error; - return asize; -} - -void -xfs_forget_acl( - struct inode *inode, - const char *name, - int xflags) -{ - /* - * Invalidate any cached ACLs if the user has bypassed the ACL - * interface. We don't validate the content whatsoever so it is caller - * responsibility to provide data in valid format and ensure i_mode is - * consistent. - */ - if (xflags & ATTR_ROOT) { -#ifdef CONFIG_XFS_POSIX_ACL - if (!strcmp(name, SGI_ACL_FILE)) - forget_cached_acl(inode, ACL_TYPE_ACCESS); - else if (!strcmp(name, SGI_ACL_DEFAULT)) - forget_cached_acl(inode, ACL_TYPE_DEFAULT); -#endif - } + return args.valuelen; } static int @@ -66,25 +43,20 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { - int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_da_args args = { + .dp = XFS_I(inode), + .attr_filter = handler->flags, + .attr_flags = flags, + .name = name, + .namelen = strlen(name), + .value = (void *)value, + .valuelen = size, + }; int error; - size_t namelen = strlen(name); - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (flags & XATTR_CREATE) - xflags |= ATTR_CREATE; - if (flags & XATTR_REPLACE) - xflags |= ATTR_REPLACE; - - if (value) - error = xfs_attr_set(ip, name, namelen, (void *)value, size, - xflags); - else - error = xfs_attr_remove(ip, name, namelen, xflags); - if (!error) - xfs_forget_acl(inode, name, xflags); + error = xfs_attr_set(&args); + if (!error && (handler->flags & XFS_ATTR_ROOT)) + xfs_forget_acl(inode, name); return error; } @@ -97,14 +69,14 @@ static const struct xattr_handler xfs_xattr_user_handler = { static const struct xattr_handler xfs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .flags = ATTR_ROOT, + .flags = XFS_ATTR_ROOT, .get = xfs_xattr_get, .set = xfs_xattr_set, }; static const struct xattr_handler xfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .flags = ATTR_SECURE, + .flags = XFS_ATTR_SECURE, .get = xfs_xattr_get, .set = xfs_xattr_set, }; @@ -134,7 +106,7 @@ __xfs_xattr_put_listent( if (context->count < 0 || context->seen_enough) return; - if (!context->alist) + if (!context->buffer) goto compute_size; arraytop = context->count + prefix_len + namelen + 1; @@ -143,7 +115,7 @@ __xfs_xattr_put_listent( context->seen_enough = 1; return; } - offset = (char *)context->alist + context->count; + offset = context->buffer + context->count; strncpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ @@ -218,7 +190,6 @@ xfs_vn_listxattr( size_t size) { struct xfs_attr_list_context context; - struct attrlist_cursor_kern cursor = { 0 }; struct inode *inode = d_inode(dentry); int error; @@ -227,14 +198,13 @@ xfs_vn_listxattr( */ memset(&context, 0, sizeof(context)); context.dp = XFS_I(inode); - context.cursor = &cursor; context.resynch = 1; - context.alist = size ? data : NULL; + context.buffer = size ? data : NULL; context.bufsize = size; context.firstu = context.bufsize; context.put_listent = xfs_xattr_put_listent; - error = xfs_attr_list_int(&context); + error = xfs_attr_list(&context); if (error) return error; if (context.count < 0) |